kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit b6320ef16497bcbfe26f0bd107c3f4b9ca3278a3
parent da99a8186349038c9d15c3e3f15a1b7f6b5975d3
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Thu, 21 Feb 2019 01:26:26 +0000

KFR 3.0.5

Diffstat:
M.gitignore | 2+-
MCHANGELOG.md | 22++++++++++++++++++++++
MCMakeLists.txt | 175+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mazure-pipelines.yml | 26++++++++++++++------------
Mcmake/arm.cmake | 4+++-
Acmake/detect_cpu.cpp | 10++++++++++
Acmake/target_set_arch.cmake | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dcmake/test_toolset/CMakeLists.txt | 3---
Mexamples/CMakeLists.txt | 23+++++++++++------------
Mexamples/biquads.cpp | 2++
Mexamples/fir.cpp | 2++
Mexamples/sample_rate_conversion.cpp | 2++
Mexamples/window.cpp | 2++
Minclude/kfr/all.hpp | 1-
Minclude/kfr/base.hpp | 33++++-----------------------------
Dinclude/kfr/base/abs.hpp | 49-------------------------------------------------
Dinclude/kfr/base/asin_acos.hpp | 67-------------------------------------------------------------------
Dinclude/kfr/base/atan.hpp | 107-------------------------------------------------------------------------------
Minclude/kfr/base/basic_expressions.hpp | 220+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Dinclude/kfr/base/bitwise.hpp | 136-------------------------------------------------------------------------------
Dinclude/kfr/base/clamp.hpp | 62--------------------------------------------------------------
Dinclude/kfr/base/comparison.hpp | 149-------------------------------------------------------------------------------
Dinclude/kfr/base/compiletime.hpp | 84-------------------------------------------------------------------------------
Dinclude/kfr/base/complex.hpp | 967-------------------------------------------------------------------------------
Dinclude/kfr/base/constants.hpp | 299-------------------------------------------------------------------------------
Minclude/kfr/base/conversion.hpp | 12++++++++----
Dinclude/kfr/base/digitreverse.hpp | 107-------------------------------------------------------------------------------
Minclude/kfr/base/expression.hpp | 219+++++++++++++++++++++++++++++++++++++++----------------------------------------
Minclude/kfr/base/filter.hpp | 9++++++---
Minclude/kfr/base/fraction.hpp | 3+--
Dinclude/kfr/base/function.hpp | 268-------------------------------------------------------------------------------
Ainclude/kfr/base/function_expressions.hpp | 30++++++++++++++++++++++++++++++
Dinclude/kfr/base/gamma.hpp | 60------------------------------------------------------------
Minclude/kfr/base/generators.hpp | 86++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Dinclude/kfr/base/horizontal.hpp | 119-------------------------------------------------------------------------------
Dinclude/kfr/base/hyperbolic.hpp | 120-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/abs.hpp | 126-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/asin_acos.hpp | 58----------------------------------------------------------
Dinclude/kfr/base/impl/atan.hpp | 229-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/clamp.hpp | 56--------------------------------------------------------
Dinclude/kfr/base/impl/gamma.hpp | 72------------------------------------------------------------------------
Dinclude/kfr/base/impl/hyperbolic.hpp | 100-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/log_exp.hpp | 315-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/logical.hpp | 289------------------------------------------------------------------------------
Dinclude/kfr/base/impl/min_max.hpp | 232-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/modzerobessel.hpp | 105-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/round.hpp | 255-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/saturation.hpp | 192-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/select.hpp | 261-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/sin_cos.hpp | 338-------------------------------------------------------------------------------
Dinclude/kfr/base/impl/sqrt.hpp | 71-----------------------------------------------------------------------
Dinclude/kfr/base/impl/tan.hpp | 141-------------------------------------------------------------------------------
Dinclude/kfr/base/intrinsics.h | 18------------------
Dinclude/kfr/base/kfr.h | 46----------------------------------------------
Dinclude/kfr/base/log_exp.hpp | 229-------------------------------------------------------------------------------
Dinclude/kfr/base/logical.hpp | 50--------------------------------------------------
Minclude/kfr/base/memory.hpp | 66+++++++++++++++++++++++++++++++++++++-----------------------------
Dinclude/kfr/base/min_max.hpp | 107-------------------------------------------------------------------------------
Dinclude/kfr/base/modzerobessel.hpp | 44--------------------------------------------
Dinclude/kfr/base/operators.hpp | 552-------------------------------------------------------------------------------
Dinclude/kfr/base/platform.hpp | 186-------------------------------------------------------------------------------
Minclude/kfr/base/pointer.hpp | 95+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Minclude/kfr/base/random.hpp | 86+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Dinclude/kfr/base/read_write.hpp | 239-------------------------------------------------------------------------------
Minclude/kfr/base/reduce.hpp | 72++++++++++++++++++++++++++++++++++++++++--------------------------------
Dinclude/kfr/base/round.hpp | 158-------------------------------------------------------------------------------
Dinclude/kfr/base/saturation.hpp | 62--------------------------------------------------------------
Dinclude/kfr/base/select.hpp | 57---------------------------------------------------------
Dinclude/kfr/base/shuffle.hpp | 625-------------------------------------------------------------------------------
Dinclude/kfr/base/simd_clang.hpp | 350-------------------------------------------------------------------------------
Dinclude/kfr/base/simd_intrin.hpp | 392-------------------------------------------------------------------------------
Dinclude/kfr/base/simd_x86.hpp | 272-------------------------------------------------------------------------------
Dinclude/kfr/base/sin_cos.hpp | 315-------------------------------------------------------------------------------
Minclude/kfr/base/small_buffer.hpp | 7+++----
Minclude/kfr/base/sort.hpp | 18+++++++++++-------
Dinclude/kfr/base/specializations.i | 109-------------------------------------------------------------------------------
Dinclude/kfr/base/sqrt.hpp | 50--------------------------------------------------
Dinclude/kfr/base/tan.hpp | 56--------------------------------------------------------
Dinclude/kfr/base/types.hpp | 429-------------------------------------------------------------------------------
Minclude/kfr/base/univector.hpp | 133++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Dinclude/kfr/base/vec.hpp | 1171-------------------------------------------------------------------------------
Minclude/kfr/cident.h | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Minclude/kfr/cometa.hpp | 560++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Minclude/kfr/cometa/array.hpp | 65+++++++++++++++++++++++++++++++++--------------------------------
Minclude/kfr/cometa/cstring.hpp | 50+++++++++++++++++++++++++-------------------------
Minclude/kfr/cometa/ctti.hpp | 23+++++++++++++++++------
Minclude/kfr/cometa/function.hpp | 34+++++++++++++++++-----------------
Minclude/kfr/cometa/named_arg.hpp | 4++--
Ainclude/kfr/cometa/numeric.hpp | 194+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/cometa/range.hpp | 33++++++++++++++++++---------------
Minclude/kfr/cometa/result.hpp | 13+++++++------
Minclude/kfr/cometa/string.hpp | 8++++----
Dinclude/kfr/cpuid.hpp | 26--------------------------
Dinclude/kfr/cpuid/cpuid.hpp | 297-------------------------------------------------------------------------------
Dinclude/kfr/cpuid/cpuid_auto.hpp | 60------------------------------------------------------------
Dinclude/kfr/data/sincos.hpp | 192-------------------------------------------------------------------------------
Minclude/kfr/dft/cache.hpp | 3+++
Minclude/kfr/dft/convolution.hpp | 28+++++++++++++++-------------
Rinclude/kfr/data/bitrev.hpp -> include/kfr/dft/data/bitrev.hpp | 0
Ainclude/kfr/dft/data/sincos.hpp | 192+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/fft.hpp | 49++++++++++++++++++++++++++++++-------------------
Minclude/kfr/dft/impl/bitrev.hpp | 45++++++++++++++++++++++++---------------------
Minclude/kfr/dft/impl/convolution-impl.cpp | 26++++++++++++++------------
Ainclude/kfr/dft/impl/dft-fft.hpp | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/impl/dft-impl.hpp | 1311+++++--------------------------------------------------------------------------
Minclude/kfr/dft/impl/dft-src.cpp | 24++++++++++++------------
Minclude/kfr/dft/impl/dft-templates.hpp | 18++++++------------
Ainclude/kfr/dft/impl/fft-impl-f32.cpp | 29+++++++++++++++++++++++++++++
Ainclude/kfr/dft/impl/fft-impl-f64.cpp | 29+++++++++++++++++++++++++++++
Ainclude/kfr/dft/impl/fft-impl.hpp | 1148+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/dft/impl/fft-templates.hpp | 50++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/impl/ft.hpp | 462+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Minclude/kfr/dft/reference_dft.hpp | 8++++----
Minclude/kfr/dsp.hpp | 1-
Minclude/kfr/dsp/biquad.hpp | 128++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Minclude/kfr/dsp/biquad_design.hpp | 21++++++++++++---------
Minclude/kfr/dsp/dcremove.hpp | 7+++++--
Minclude/kfr/dsp/delay.hpp | 57++++++++++++++++++++++++++++++++-------------------------
Minclude/kfr/dsp/ebu.hpp | 58+++++++++++++++++++++++++++++++++++++++++++---------------
Minclude/kfr/dsp/fir.hpp | 57++++++++++++++++++++++++++++++++-------------------------
Minclude/kfr/dsp/fir_design.hpp | 67+++++++++++++++++++++++++++++++++++--------------------------------
Minclude/kfr/dsp/fracdelay.hpp | 8++++++--
Minclude/kfr/dsp/goertzel.hpp | 24++++++++++++++----------
Dinclude/kfr/dsp/interpolation.hpp | 72------------------------------------------------------------------------
Minclude/kfr/dsp/mixdown.hpp | 12++++++++----
Minclude/kfr/dsp/oscillators.hpp | 96+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Minclude/kfr/dsp/sample_rate_conversion.hpp | 140+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Minclude/kfr/dsp/speaker.hpp | 5++++-
Minclude/kfr/dsp/special.hpp | 20++++++++++++--------
Minclude/kfr/dsp/units.hpp | 55+++++++++++++++++++++++++++++--------------------------
Minclude/kfr/dsp/waveshaper.hpp | 20++++++++++++--------
Minclude/kfr/dsp/weighting.hpp | 34+++++++++++++++++++---------------
Minclude/kfr/dsp/window.hpp | 128++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Dinclude/kfr/ext/console_colors.hpp | 162-------------------------------------------------------------------------------
Dinclude/kfr/ext/double_double.hpp | 86-------------------------------------------------------------------------------
Minclude/kfr/io/audiofile.hpp | 54+++++++++++++++++++++++++++++-------------------------
Minclude/kfr/io/file.hpp | 27++++++++++++++++++++++++---
Minclude/kfr/io/impl/audiofile-impl.cpp | 4++++
Minclude/kfr/io/python_plot.hpp | 20++++++++++----------
Minclude/kfr/io/tostring.hpp | 53++++++++++++++++++++++++++++++++++++++++++++++++++---
Ainclude/kfr/kfr.h | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/math.hpp | 22+++++++++++++++++++++-
Ainclude/kfr/math/abs.hpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/asin_acos.hpp | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/atan.hpp | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/clamp.hpp | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/compiletime.hpp | 84+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/complex_math.hpp | 410+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/gamma.hpp | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/hyperbolic.hpp | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/abs.hpp | 138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/asin_acos.hpp | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/atan.hpp | 230+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/clamp.hpp | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/gamma.hpp | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/hyperbolic.hpp | 99+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/log_exp.hpp | 335+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/logical.hpp | 278+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/min_max.hpp | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/modzerobessel.hpp | 104+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/round.hpp | 282+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/saturation.hpp | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/select.hpp | 329+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/sin_cos.hpp | 310+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/sqrt.hpp | 72++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/impl/tan.hpp | 149+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/interpolation.hpp | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/log_exp.hpp | 232+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/logical.hpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/min_max.hpp | 111+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/modzerobessel.hpp | 47+++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/round.hpp | 163+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/saturation.hpp | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/select.hpp | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/sin_cos.hpp | 318+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/sqrt.hpp | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/math/tan.hpp | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/runtime.hpp | 26++++++++++++++++++++++++++
Ainclude/kfr/runtime/cpuid.hpp | 300+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/runtime/cpuid_auto.hpp | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd.hpp | 36++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/comparison.hpp | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/complex.hpp | 468+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/constants.hpp | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/digitreverse.hpp | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/horizontal.hpp | 138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/backend.hpp | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/backend_clang.hpp | 228+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/backend_generic.hpp | 1080+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/basicoperators_clang.hpp | 178+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/basicoperators_generic.hpp | 1674+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/function.hpp | 295+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/intrinsics.h | 50++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/operators.hpp | 164+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/simd.hpp | 183+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/impl/specializations.i | 116+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/mask.hpp | 155+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/operators.hpp | 810+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/platform.hpp | 286+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/read_write.hpp | 243+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/shuffle.hpp | 569+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/types.hpp | 372+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/simd/vec.hpp | 1283+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/testo/assert.hpp | 5++++-
Minclude/kfr/testo/comparison.hpp | 38++++++++++++++++++++++++++++++++------
Ainclude/kfr/testo/console_colors.hpp | 166+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/kfr/testo/double_double.hpp | 170+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/testo/testo.hpp | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Minclude/kfr/version.hpp | 3+--
Msources.cmake | 172+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Mtests/CMakeLists.txt | 121+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Mtests/all_tests.cpp | 20+++++++++++++++++++-
Atests/all_tests_merged.cpp | 25+++++++++++++++++++++++++
Atests/asm_test.cpp | 213+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtests/base_test.cpp | 346++-----------------------------------------------------------------------------
Mtests/complex_test.cpp | 18++++++++----------
Atests/data/acos_double_fuzz | 0
Atests/data/acos_double_narrow | 0
Atests/data/acos_float_fuzz | 0
Atests/data/acos_float_narrow | 0
Atests/data/asin_double_fuzz | 0
Atests/data/asin_double_narrow | 0
Atests/data/asin_float_fuzz | 0
Atests/data/asin_float_narrow | 0
Atests/data/atan2_double_fuzz | 0
Atests/data/atan2_double_narrow | 0
Atests/data/atan2_float_fuzz | 0
Atests/data/atan2_float_narrow | 0
Atests/data/atan_double_fuzz | 0
Atests/data/atan_double_narrow | 0
Atests/data/atan_float_fuzz | 0
Atests/data/atan_float_narrow | 0
Atests/data/cbrt_double_fuzz | 0
Atests/data/cbrt_double_narrow | 0
Atests/data/cbrt_float_fuzz | 0
Atests/data/cbrt_float_narrow | 0
Atests/data/cos_double_fuzz | 0
Atests/data/cos_double_narrow | 0
Atests/data/cos_float_fuzz | 0
Atests/data/cos_float_narrow | 0
Atests/data/cosh_double_fuzz | 0
Atests/data/cosh_double_narrow | 0
Atests/data/cosh_float_fuzz | 0
Atests/data/cosh_float_narrow | 0
Atests/data/coth_double_fuzz | 0
Atests/data/coth_double_narrow | 0
Atests/data/coth_float_fuzz | 0
Atests/data/coth_float_narrow | 0
Atests/data/exp10_double_fuzz | 0
Atests/data/exp10_double_narrow | 0
Atests/data/exp10_float_fuzz | 0
Atests/data/exp10_float_narrow | 0
Atests/data/exp2_double_fuzz | 0
Atests/data/exp2_double_narrow | 0
Atests/data/exp2_float_fuzz | 0
Atests/data/exp2_float_narrow | 0
Atests/data/exp_double_fuzz | 0
Atests/data/exp_double_narrow | 0
Atests/data/exp_float_fuzz | 0
Atests/data/exp_float_narrow | 0
Atests/data/gamma_double_fuzz | 0
Atests/data/gamma_double_narrow | 0
Atests/data/gamma_float_fuzz | 0
Atests/data/gamma_float_narrow | 0
Atests/data/log10_double_fuzz | 0
Atests/data/log10_double_narrow | 0
Atests/data/log10_float_fuzz | 0
Atests/data/log10_float_narrow | 0
Atests/data/log2_double_fuzz | 0
Atests/data/log2_double_narrow | 0
Atests/data/log2_float_fuzz | 0
Atests/data/log2_float_narrow | 0
Atests/data/log_double_fuzz | 0
Atests/data/log_double_narrow | 0
Atests/data/log_float_fuzz | 0
Atests/data/log_float_narrow | 0
Atests/data/sin_double_fuzz | 0
Atests/data/sin_double_narrow | 0
Atests/data/sin_float_fuzz | 0
Atests/data/sin_float_narrow | 0
Atests/data/sinh_double_fuzz | 0
Atests/data/sinh_double_narrow | 0
Atests/data/sinh_float_fuzz | 0
Atests/data/sinh_float_narrow | 0
Atests/data/tan_double_fuzz | 0
Atests/data/tan_double_narrow | 0
Atests/data/tan_float_fuzz | 0
Atests/data/tan_float_narrow | 0
Atests/data/tanh_double_fuzz | 0
Atests/data/tanh_double_narrow | 0
Atests/data/tanh_float_fuzz | 0
Atests/data/tanh_float_narrow | 0
Mtests/dft_test.cpp | 111+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mtests/dsp_test.cpp | 208++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Dtests/ebu_test.cpp | 122-------------------------------------------------------------------------------
Dtests/empty_test.cpp | 5-----
Mtests/expression_test.cpp | 31+++++++++++++++++++++++--------
Atests/generate_data.cpp | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtests/intrinsic_test.cpp | 290++++++++-----------------------------------------------------------------------
Mtests/io_test.cpp | 15+++++++++------
Mtests/mpfr/mpfrplus.hpp | 25++++++++++++++++---------
Mtests/multiarch.cpp | 1-
Atests/numeric_tests.hpp | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dtests/resampler_test.cpp | 37-------------------------------------
Dtests/transcendental_test.cpp | 172-------------------------------------------------------------------------------
Atests/unit/base/conversion.cpp | 67+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/base/reduce.cpp | 41+++++++++++++++++++++++++++++++++++++++++
Atests/unit/math/abs.cpp | 13+++++++++++++
Atests/unit/math/asin_acos.cpp | 18++++++++++++++++++
Atests/unit/math/atan.cpp | 18++++++++++++++++++
Atests/unit/math/hyperbolic.cpp | 21+++++++++++++++++++++
Atests/unit/math/log_exp.cpp | 23+++++++++++++++++++++++
Atests/unit/math/min_max.cpp | 39+++++++++++++++++++++++++++++++++++++++
Atests/unit/math/round.cpp | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/math/select.cpp | 27+++++++++++++++++++++++++++
Atests/unit/math/sin_cos.cpp | 17+++++++++++++++++
Atests/unit/math/tan.cpp | 16++++++++++++++++
Atests/unit/simd/complex.cpp | 33+++++++++++++++++++++++++++++++++
Atests/unit/simd/operators.cpp | 220+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/simd/shuffle.cpp | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atests/unit/simd/vec.cpp | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/CMakeLists.txt | 28++++++++++++++++++++++++++++
Atools/ebu_test.cpp | 120+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rexamples/sample_rate_converter.cpp -> tools/sample_rate_converter.cpp | 0
Mupdate-sources.py | 32++++++++++++++++++++------------
325 files changed, 21331 insertions(+), 15841 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -77,7 +77,7 @@ var/ venv/ -# Sphinx documentation +# Documentation docs/ mkdocs/ diff --git a/CHANGELOG.md b/CHANGELOG.md @@ -1,5 +1,26 @@ # Changelog +## 3.0.5 + +2019-02-21 + +#### Added + +- DFT speeds have been improved by up to 15% on most modern cpus +- Support for MSVC 2017 +- Support for GCC 7.3 +- Support for GCC 8.2 +- Support for resampling complex vectors (Thanks to https://github.com/ermito) +- Tests for various math functions no longer depend on MPFR + +#### Changed + +- Testo now allocates much less memory during long tests (x3 less than previously) + +#### Fixed + +- Building generators (Thanks to https://github.com/ermito) + ## 3.0.4 2019-01-08 @@ -9,6 +30,7 @@ #### Changed - KFR_READCYCLECOUNTER may be redefined to point to any function returning (pseudo-)random value +- Ability to disable random number initialization functions #### Fixed diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -15,15 +15,33 @@ # along with KFR. -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.1) + +message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}" CACHE STRING "compile flags" FORCE) +message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}) + project(kfr CXX) -message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER} ") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER} ") message(STATUS CMAKE_SYSTEM_PROCESSOR = ${CMAKE_SYSTEM_PROCESSOR}) + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + set (X86 TRUE) +else () + set (X86 FALSE) +endif () + +if (X86) + message(STATUS X86) +endif () + if (MSVC) message(STATUS MSVC) endif() @@ -34,77 +52,128 @@ else() set(CLANG 0) endif() -# Include list of source files +# Include autogenerated list of source files 
include(sources.cmake) -add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) +option(ENABLE_TESTS "Enable tests and examples" OFF) +if (CLANG) + option(ENABLE_DFT "Enable DFT and related algorithms. Requires Clang" ON) +endif () +option(ENABLE_ASMTEST "Enable writing disassembly" OFF) +option(REGENERATE_TESTS "Regenerate auto tests" OFF) +option(DISABLE_CLANG_EXTENSIONS "Disable Clang vector extensions" OFF) +option(KFR_EXTENDED_TESTS "Extended tests (up to hour)" OFF) +mark_as_advanced(ENABLE_ASMTEST) +mark_as_advanced(REGENERATE_TESTS) +mark_as_advanced(DISABLE_CLANG_EXTENSIONS) + +if (NOT CPU_ARCH) + set(CPU_ARCH avx2) +endif () -option(ENABLE_TESTS "Enable tests and examples. This changes many compiler flags" OFF) -option(ENABLE_DFT "Enable DFT and related algorithms" ON) +if (CPU_ARCH STREQUAL "detect") + message(STATUS "Detecting native cpu...") + try_run( + RUN_RESULT COMPILE_RESULT + "${CMAKE_BINARY_DIR}/tmpdir" + ${CMAKE_SOURCE_DIR}/cmake/detect_cpu.cpp + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_SOURCE_DIR}/include" + COMPILE_OUTPUT_VARIABLE COMPILE_OUT + RUN_OUTPUT_VARIABLE RUN_OUT + ) + if (COMPILE_RESULT AND RUN_RESULT EQUAL 0) + message(STATUS DETECTED_CPU = ${RUN_OUT}) + set(CPU_ARCH ${RUN_OUT}) + else() + message(STATUS COMPILE_RESULT = ${COMPILE_RESULT}) + message(STATUS RUN_RESULT = ${RUN_RESULT}) + message(STATUS COMPILE_OUT = ${COMPILE_OUT}) + message(STATUS RUN_OUT = ${RUN_OUT}) + endif () +endif () -set(KFR_DFT_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-src.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/dft_c.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp) +include(cmake/target_set_arch.cmake) -set(KFR_IO_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp) +add_library(use_arch INTERFACE) +target_set_arch(use_arch INTERFACE ${CPU_ARCH}) -if 
(ENABLE_TESTS) +if (WIN32) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) +endif() - if (IOS) - set(STD_LIB) - else () - set(STD_LIB stdc++) - endif () +if (IOS) + set(STD_LIB) +else () + set(STD_LIB stdc++) +endif () + +# KFR library +add_library(kfr INTERFACE) +target_sources(kfr INTERFACE ${KFR_SRC}) +target_include_directories(kfr INTERFACE include) +target_compile_options(kfr INTERFACE "$<$<CONFIG:DEBUG>:-DKFR_DEBUG>") +if (NOT MSVC) + target_compile_options(kfr INTERFACE -mstackrealign) +endif () +if (MSVC) + target_compile_options(kfr INTERFACE -bigobj) +else () + target_link_libraries(kfr INTERFACE ${STD_LIB} pthread m) +endif () +if (DISABLE_CLANG_EXTENSIONS) + target_compile_definitions(kfr INTERFACE -DCMT_DISABLE_CLANG_EXT) +endif () +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + target_compile_options(kfr INTERFACE -Wno-ignored-qualifiers) +endif () +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(kfr INTERFACE -Wno-c++1z-extensions) +endif () - # Binary output directories - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) +if (NOT ENABLE_DFT) + target_compile_definitions(kfr INTERFACE -DKFR_NO_DFT) +endif () +if (KFR_EXTENDED_TESTS) + target_compile_definitions(kfr INTERFACE -DKFR_EXTENDED_TESTS) +endif() - add_definitions(-D_CRT_SECURE_NO_WARNINGS) +message(STATUS CPU_ARCH=${CPU_ARCH}) - if (NOT MSVC OR CLANG) - # Enable C++14, disable exceptions and rtti - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - add_compile_options(-std=gnu++1y) - else () - add_compile_options(-std=c++1y) - endif () - add_compile_options(-fno-exceptions -fno-rtti ) - if (NOT ARCH_FLAGS) - add_compile_options(-march=native) - message(STATUS "Building for native cpu") - if(WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mstackrealign -fno-asynchronous-unwind-tables") - endif() - else () - set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} ${ARCH_FLAGS}") - endif () - if(NOT MSVC) - link_libraries(${STD_LIB} pthread m) - endif() - else () - # Disable exceptions - add_compile_options(/EHsc /D_HAS_EXCEPTIONS=0 /D_CRT_SECURE_NO_WARNINGS=1) - add_compile_options(/arch:AVX) +if (ENABLE_TESTS) + + if (MSVC) + else() + # disable exceptions and rtti + add_compile_options(-fno-exceptions -fno-rtti -fno-asynchronous-unwind-tables) endif () add_subdirectory(examples) add_subdirectory(tests) + add_subdirectory(tools) endif () -add_library(kfr INTERFACE) -target_sources(kfr INTERFACE ${KFR_SRC}) -target_include_directories(kfr INTERFACE include) - if (ENABLE_DFT) + if (NOT CLANG) + message(FATAL_ERROR "Clang compiler is required for DFT in KFR. See README.md for more information") + endif() add_library(kfr_dft ${KFR_DFT_SRC}) - target_link_libraries(kfr_dft kfr) + target_link_libraries(kfr_dft kfr use_arch) + if (MSVC) + target_compile_options(kfr_dft PRIVATE -fp:fast) + else() + target_compile_options(kfr_dft PRIVATE -ffast-math) + endif() endif() add_library(kfr_io ${KFR_IO_SRC}) target_link_libraries(kfr_io kfr) target_compile_definitions(kfr_io PUBLIC KFR_ENABLE_FLAC=1) + +install(TARGETS kfr kfr_io ARCHIVE DESTINATION lib) + +if (ENABLE_DFT) + install(TARGETS kfr_dft ARCHIVE DESTINATION lib) +endif () + +install(DIRECTORY include/kfr DESTINATION include) diff --git a/azure-pipelines.yml b/azure-pipelines.yml @@ -6,7 +6,7 @@ jobs: - bash: | set -e sudo apt-get update && sudo apt-get install -y ninja-build libmpfr-dev - ci/run.sh build-release -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release - job: Linux_x86_64_Clang_Debug pool: @@ -15,7 +15,7 @@ jobs: - bash: | set -e sudo apt-get update && sudo apt-get install -y ninja-build libmpfr-dev - ci/run.sh build-debug -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug + ci/run.sh build-debug -DCPU_ARCH=detect -DCMAKE_CXX_COMPILER=clang++ 
-DCMAKE_BUILD_TYPE=Debug - job: Linux_ARM_Clang_Release pool: @@ -46,7 +46,7 @@ jobs: set -e /bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer" brew install ninja - ci/run.sh build-release -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_BUILD_TYPE=Release - job: macOS_x86_64_Clang_Debug strategy: @@ -62,7 +62,7 @@ jobs: set -e /bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer" brew install ninja - ci/run.sh build-release -DCMAKE_BUILD_TYPE=Release + ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_64_Clang_Release pool: @@ -73,7 +73,7 @@ jobs: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars64.bat" set PATH=%PATH:C:\tools\mingw64\bin;=% set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% - ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release + ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_Clang_Release pool: @@ -84,7 +84,7 @@ jobs: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars32.bat" set PATH=%PATH:C:\tools\mingw64\bin;=% set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% - ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release + ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_Clang_Debug pool: @@ -95,32 +95,34 @@ jobs: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars32.bat" set PATH=%PATH:C:\tools\mingw64\bin;=% 
set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=% - ci\run.cmd build-debug -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Debug + ci\run.cmd build-debug -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Debug - job: Windows_MinGW_x86_64_AVX512_Clang_Release pool: WIN-AVX512 steps: - script: | set PATH=C:\msys64\mingw64\bin;C:\msys64\usr\local\bin;C:\msys64\usr\bin;%PATH% - bash -c "ci/run.sh build-release -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Release" + bash -c "ci/run.sh build-release -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Release" - job: Windows_MinGW_x86_64_AVX512_Clang_Debug pool: WIN-AVX512 steps: - script: | set PATH=C:\msys64\mingw64\bin;C:\msys64\usr\local\bin;C:\msys64\usr\bin;%PATH% - bash -c "ci/run.sh build-debug -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Debug" - + bash -c "ci/run.sh build-debug -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Debug" + - job: Windows_MSVC_x86_64_AVX512_Clang_Release pool: WIN-AVX512 steps: - script: | call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" - ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DARCH_FLAGS="-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release + set CXXFLAGS=-m64 + ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DCPU_ARCH=avx512 -DCMAKE_BUILD_TYPE=Release - job: Windows_MSVC_x86_64_AVX512_Clang_Debug pool: WIN-AVX512 steps: - script: | call "C:\Program 
Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" - ci\run.cmd build-debug -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DARCH_FLAGS="-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Debug + set CXXFLAGS=-m64 + ci\run.cmd build-debug -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DCPU_ARCH=avx512 -DCMAKE_BUILD_TYPE=Debug diff --git a/cmake/arm.cmake b/cmake/arm.cmake @@ -11,7 +11,9 @@ set (CMAKE_CXX_COMPILER_WORKS TRUE) set (CMAKE_C_COMPILER_WORKS TRUE) set (ARM_ROOT "/usr/arm-linux-gnueabihf/include") -set (GCC_VER 5.4.0) +if (NOT GCC_VER) + set (GCC_VER 5.4.0) +endif () set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/arm-linux-gnueabihf -isystem ${ARM_ROOT}") set (ARM_COMMON_FLAGS "-target arm-linux-gnueabihf -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static") diff --git a/cmake/detect_cpu.cpp b/cmake/detect_cpu.cpp @@ -0,0 +1,9 @@ +#include <kfr/runtime/cpuid.hpp> + +using namespace kfr; + +int main() +{ + cpu_t cpu = kfr::internal_generic::detect_cpu(); + printf("%s", cpu_name(cpu)); +} +\ No newline at end of file diff --git a/cmake/target_set_arch.cmake b/cmake/target_set_arch.cmake @@ -0,0 +1,56 @@ + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + + set(ARCH_FLAGS_GNU_generic -DCMT_FORCE_GENERIC_CPU) + set(ARCH_FLAGS_GNU_sse2 -msse2) + set(ARCH_FLAGS_GNU_sse3 -msse3) + set(ARCH_FLAGS_GNU_ssse3 -mssse3) + set(ARCH_FLAGS_GNU_sse41 -msse4.1) + set(ARCH_FLAGS_GNU_avx -msse4.1 -mavx) + set(ARCH_FLAGS_GNU_avx2 -msse4.1 -mavx2 -mfma) + set(ARCH_FLAGS_GNU_avx512 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl) + + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + # SSE2 is part of x86_64 + set(ARCH_FLAG_MS_SSE2) + else() + set(ARCH_FLAG_MS_SSE2 -arch:SSE2) + endif() + + set(ARCH_FLAGS_MS_generic ${ARCH_FLAG_MS_SSE2} 
-DCMT_FORCE_GENERIC_CPU) + set(ARCH_FLAGS_MS_sse2 ${ARCH_FLAG_MS_SSE2}) + set(ARCH_FLAGS_MS_sse3 ${ARCH_FLAG_MS_SSE2} -D__SSE3__) + set(ARCH_FLAGS_MS_ssse3 ${ARCH_FLAG_MS_SSE2} -D__SSSE3__) + set(ARCH_FLAGS_MS_sse41 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__) + set(ARCH_FLAGS_MS_avx -arch:AVX) + set(ARCH_FLAGS_MS_avx2 -arch:AVX2) + set(ARCH_FLAGS_MS_avx512 -arch:AVX512) + + function(target_set_arch TARGET MODE ARCH) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + set(CLANG 1) + else () + set(CLANG 0) + endif() + message(STATUS "target_set_arch(${TARGET} ${MODE} ${ARCH})") + if (CLANG OR NOT MSVC) + # Reset previous arch flags + if (CMAKE_SIZEOF_VOID_P EQUAL 8) + target_compile_options(${TARGET} ${MODE} -mno-sse3) + else() + target_compile_options(${TARGET} ${MODE} -mno-sse) + endif() + endif () + if (MSVC AND NOT CLANG) + target_compile_options(${TARGET} ${MODE} ${ARCH_FLAGS_MS_${ARCH}}) + else() + target_compile_options(${TARGET} ${MODE} ${ARCH_FLAGS_GNU_${ARCH}}) + endif () + endfunction() + +else() + + function(target_set_arch TARGET MODE ARCH) + endfunction() + +endif () diff --git a/cmake/test_toolset/CMakeLists.txt b/cmake/test_toolset/CMakeLists.txt @@ -1,3 +0,0 @@ -cmake_minimum_required(VERSION 3.0) - -project(test_toolset CXX) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt @@ -15,33 +15,32 @@ # along with KFR. 
-cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.1) -file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/svg) +# Binary output directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) -include_directories(../include) +file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/svg) add_executable(biquads biquads.cpp) -target_link_libraries(biquads kfr) +target_link_libraries(biquads kfr use_arch) add_executable(window window.cpp) -target_link_libraries(window kfr) +target_link_libraries(window kfr use_arch) add_executable(fir fir.cpp) +target_link_libraries(fir kfr use_arch) -target_link_libraries(fir kfr) if (ENABLE_DFT) - target_link_libraries(fir kfr_dft) + target_link_libraries(fir kfr_dft use_arch) target_compile_definitions(fir PRIVATE -DHAVE_DFT) endif () add_executable(sample_rate_conversion sample_rate_conversion.cpp) -target_link_libraries(sample_rate_conversion kfr kfr_io) - -add_executable(sample_rate_converter sample_rate_converter.cpp) -target_link_libraries(sample_rate_converter kfr kfr_io) +target_link_libraries(sample_rate_conversion kfr kfr_io use_arch) if (ENABLE_DFT) add_executable(dft dft.cpp) - target_link_libraries(dft kfr kfr_dft) + target_link_libraries(dft kfr kfr_dft use_arch) endif () diff --git a/examples/biquads.cpp b/examples/biquads.cpp @@ -94,5 +94,7 @@ int main() plot_save("biquad_filter_lowpass", output, options + ", title='Biquad Low pass filter (0.2, 0.9) (using biquad_filter)'"); + println("SVG plots have been saved to svg directory"); + return 0; } diff --git a/examples/fir.cpp b/examples/fir.cpp @@ -148,5 +148,7 @@ int main() #endif #endif + println("SVG plots have been saved to svg directory"); + return 0; } diff --git a/examples/sample_rate_conversion.cpp b/examples/sample_rate_conversion.cpp @@ -72,5 +72,7 @@ int main() plot_save("audio_draft_quality", "audio_draft_quality.wav", ""); } + println("SVG plots have been saved to svg 
directory"); + return 0; } diff --git a/examples/window.cpp b/examples/window.cpp @@ -57,5 +57,7 @@ int main() output = window_kaiser(output.size(), 2.5); plot_save("window_kaiser", output, options + ", title='Kaiser window'"); + println("SVG plots have been saved to svg directory"); + return 0; } diff --git a/include/kfr/all.hpp b/include/kfr/all.hpp @@ -22,7 +22,6 @@ */ #include "base.hpp" -#include "cpuid.hpp" #include "dft.hpp" #include "dsp.hpp" #include "io.hpp" diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp @@ -22,44 +22,19 @@ */ #pragma once -#include "base/abs.hpp" -#include "base/asin_acos.hpp" -#include "base/atan.hpp" +#include "math.hpp" + #include "base/basic_expressions.hpp" -#include "base/clamp.hpp" -#include "base/comparison.hpp" -#include "base/compiletime.hpp" -#include "base/complex.hpp" -#include "base/constants.hpp" #include "base/conversion.hpp" -#include "base/digitreverse.hpp" #include "base/expression.hpp" #include "base/filter.hpp" -#include "base/function.hpp" -#include "base/gamma.hpp" +#include "base/fraction.hpp" +#include "base/function_expressions.hpp" #include "base/generators.hpp" -#include "base/horizontal.hpp" -#include "base/hyperbolic.hpp" -#include "base/log_exp.hpp" -#include "base/logical.hpp" #include "base/memory.hpp" -#include "base/min_max.hpp" -#include "base/modzerobessel.hpp" -#include "base/operators.hpp" #include "base/pointer.hpp" #include "base/random.hpp" -#include "base/read_write.hpp" #include "base/reduce.hpp" -#include "base/round.hpp" -#include "base/saturation.hpp" -#include "base/select.hpp" -#include "base/shuffle.hpp" -#include "base/sin_cos.hpp" #include "base/small_buffer.hpp" #include "base/sort.hpp" -#include "base/sqrt.hpp" -#include "base/tan.hpp" -#include "base/types.hpp" #include "base/univector.hpp" -#include "base/vec.hpp" -#include "version.hpp" diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp @@ -1,49 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 
2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/abs.hpp" - -namespace kfr -{ -/** - * @brief Returns the absolute value of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 abs(const T1& x) -{ - return intrinsics::abs(x); -} - -/** - * @brief Returns template expression that returns the absolute value of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::abs, E1> abs(E1&& x) -{ - return { fn::abs(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp @@ -1,67 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/asin_acos.hpp" - -namespace kfr -{ - -/** - * @brief Returns the arc sine of x. The returned angle is in the range \f$-\pi/2\f$ through \f$\pi/2\f$. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> asin(const T1& x) -{ - return intrinsics::asin(x); -} - -/** - * @brief Returns template expression that returns the arc sine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::asin, E1> asin(E1&& x) -{ - return { fn::asin(), std::forward<E1>(x) }; -} -/** - * @brief Returns the arc cosine of x. The returned angle is in the range 0 through \f$\pi\f$. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> acos(const T1& x) -{ - return intrinsics::acos(x); -} - -/** - * @brief Returns template expression that returns the arc cosine of x. 
- */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::acos, E1> acos(E1&& x) -{ - return { fn::acos(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp @@ -1,107 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/atan.hpp" - -namespace kfr -{ - -/** - * @brief Returns the arc tangent of x. The returned angle is in the range \f$-\pi/2\f$ through - * \f$\pi/2\f$. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> atan(const T1& x) -{ - return intrinsics::atan(x); -} - -/** - * @brief Returns template expression that returns the arc tangent of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::atan, E1> atan(E1&& x) -{ - return { fn::atan(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the arc tangent of the x, expressed in degrees. 
The returned angle is in the range -90 - * through 90. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> atandeg(const T1& x) -{ - return intrinsics::atandeg(x); -} - -/** - * @brief Returns template expression that returns the arc tangent of the x, expressed in degrees. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::atandeg, E1> atandeg(E1&& x) -{ - return { fn::atandeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the arc tangent of y/x using the signs of arguments to determine the correct quadrant. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC common_type<T1, T2> atan2(const T1& x, const T2& y) -{ - return intrinsics::atan2(x, y); -} - -/** - * @brief Returns template expression that returns the arc tangent of y/x. - */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::atan2, E1, E2> atan2(E1&& x, E2&& y) -{ - return { fn::atan2(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the arc tangent of y/x (expressed in degrees) using the signs of arguments to determine the - * correct quadrant. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC common_type<T1, T2> atan2deg(const T1& x, const T2& y) -{ - return intrinsics::atan2deg(x, y); -} - -/** - * @brief Returns template expression that returns the arc tangent of y/x (expressed in degrees). 
- */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::atan2deg, E1, E2> atan2deg(E1&& x, E2&& y) -{ - return { fn::atan2deg(), std::forward<E1>(x), std::forward<E2>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp @@ -25,27 +25,51 @@ */ #pragma once -#include "operators.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" #include "univector.hpp" -#include "vec.hpp" #include <algorithm> namespace kfr { +inline namespace CMT_ARCH_NAME +{ + +namespace internal +{ +template <size_t width, typename Fn> +KFR_INTRINSIC void block_process_impl(size_t& i, size_t size, Fn&& fn) +{ + CMT_LOOP_NOUNROLL + for (; i < size / width * width; i += width) + fn(i, csize_t<width>()); +} +} // namespace internal + +template <size_t... widths, typename Fn> +KFR_INTRINSIC void block_process(size_t size, csizes_t<widths...>, Fn&& fn) +{ + size_t i = 0; + swallow{ (internal::block_process_impl<widths>(i, size, std::forward<Fn>(fn)), 0)... 
}; +} namespace internal { template <typename To, typename E> -struct expression_convert : expression_base<E> +struct expression_convert : expression_with_arguments<E> { using value_type = To; - CMT_INLINE expression_convert(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {} + KFR_MEM_INTRINSIC expression_convert(E&& expr) CMT_NOEXCEPT + : expression_with_arguments<E>(std::forward<E>(expr)) + { + } template <size_t N> - CMT_INLINE vec<To, N> operator()(cinput_t input, size_t index, vec_t<To, N>) const + friend KFR_INTRINSIC vec<To, N> get_elements(const expression_convert& self, cinput_t input, + size_t index, vec_shape<To, N>) { - return this->argument_first(input, index, vec_t<To, N>()); + return self.argument_first(input, index, vec_shape<To, N>()); } }; @@ -56,7 +80,7 @@ struct expression_iterator struct iterator { T operator*() const { return get(); } - T get() const { return expr.e1(cinput, position, vec_t<T, 1>())[0]; } + T get() const { return get_elements(expr.e1, cinput, position, vec_shape<T, 1>()).front(); } iterator& operator++() { ++position; @@ -79,13 +103,13 @@ struct expression_iterator } // namespace internal template <typename To, typename E> -CMT_INLINE internal::expression_convert<To, E> convert(E&& expr) +KFR_INTRINSIC internal::expression_convert<To, E> convert(E&& expr) { return internal::expression_convert<To, E>(std::forward<E>(expr)); } template <typename E1, typename T = value_type_of<E1>> -CMT_INLINE internal::expression_iterator<T, E1> to_iterator(E1&& e1) +KFR_INTRINSIC internal::expression_iterator<T, E1> to_iterator(E1&& e1) { return internal::expression_iterator<T, E1>(std::forward<E1>(e1)); } @@ -99,30 +123,30 @@ inline auto sequence(const Ts&... 
list) } template <typename T = int> -CMT_INLINE auto zeros() +KFR_INTRINSIC auto zeros() { return lambda<T>([](cinput_t, size_t, auto x) { return zerovector(x); }); } template <typename T = int> -CMT_INLINE auto ones() +KFR_INTRINSIC auto ones() { - return lambda<T>([](cinput_t, size_t, auto x) { return 1; }); + return lambda<T>([](cinput_t, size_t, auto) { return 1; }); } template <typename T = int> -CMT_INLINE auto counter() +KFR_INTRINSIC auto counter() { return lambda<T>([](cinput_t, size_t index, auto x) { return enumerate(x) + index; }); } template <typename T1> -CMT_INLINE auto counter(T1 start) +KFR_INTRINSIC auto counter(T1 start) { return lambda<T1>([start](cinput_t, size_t index, auto x) { return enumerate(x) + index + start; }); } template <typename T1, typename T2> -CMT_INLINE auto counter(T1 start, T2 step) +KFR_INTRINSIC auto counter(T1 start, T2 step) { return lambda<common_type<T1, T2>>( [start, step](cinput_t, size_t index, auto x) { return (enumerate(x) + index) * step + start; }); @@ -149,10 +173,10 @@ namespace internal template <typename T, typename E1> struct expression_reader { - constexpr expression_reader(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {} + constexpr expression_reader(E1&& e1) CMT_NOEXCEPT : e1(std::forward<E1>(e1)) {} T read() const { - const T result = e1(cinput, m_position, vec_t<T, 1>()); + const T result = get_elements(e1, cinput, m_position, vec_shape<T, 1>()); m_position++; return result; } @@ -162,7 +186,7 @@ struct expression_reader template <typename T, typename E1> struct expression_writer { - constexpr expression_writer(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {} + constexpr expression_writer(E1&& e1) CMT_NOEXCEPT : e1(std::forward<E1>(e1)) {} template <typename U> void write(U value) { @@ -192,19 +216,20 @@ namespace internal { template <typename E1> -struct expression_slice : expression_base<E1> +struct expression_slice : expression_with_arguments<E1> { using value_type = value_type_of<E1>; using T = 
value_type; expression_slice(E1&& e1, size_t start, size_t size) - : expression_base<E1>(std::forward<E1>(e1)), start(start), + : expression_with_arguments<E1>(std::forward<E1>(e1)), start(start), new_size(size_min(size, size_sub(std::get<0>(this->args).size(), start))) { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_slice& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return this->argument_first(cinput, index + start, y); + return self.argument_first(cinput, index + self.start, y); } size_t size() const { return new_size; } size_t start; @@ -212,15 +237,16 @@ struct expression_slice : expression_base<E1> }; template <typename E1> -struct expression_reverse : expression_base<E1> +struct expression_reverse : expression_with_arguments<E1> { using value_type = value_type_of<E1>; using T = value_type; - expression_reverse(E1&& e1) : expression_base<E1>(std::forward<E1>(e1)), expr_size(e1.size()) {} + expression_reverse(E1&& e1) : expression_with_arguments<E1>(std::forward<E1>(e1)), expr_size(e1.size()) {} template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_reverse& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return reverse(this->argument_first(cinput, expr_size - index - N, y)); + return reverse(self.argument_first(cinput, self.expr_size - index - N, y)); } size_t size() const { return expr_size; } size_t expr_size; @@ -234,7 +260,7 @@ struct expression_linspace<T, false> : input_expression { using value_type = T; - CMT_INLINE constexpr size_t size() const noexcept { return truncate_size; } + KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT { return truncate_size; } expression_linspace(T start, T stop, size_t size, bool endpoint = false, bool truncate = false) : start(start), 
offset((stop - start) / T(endpoint ? size - 1 : size)), @@ -248,10 +274,11 @@ struct expression_linspace<T, false> : input_expression } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_linspace& self, cinput_t, size_t index, + vec_shape<T, N> x) { using TI = itype<T>; - return T(start) + (enumerate(x) + cast<T>(cast<TI>(index))) * T(offset); + return T(self.start) + (enumerate(x) + static_cast<T>(static_cast<TI>(index))) * T(self.offset); } T start; @@ -264,7 +291,7 @@ struct expression_linspace<T, true> : input_expression { using value_type = T; - CMT_INLINE constexpr size_t size() const noexcept { return truncate_size; } + KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT { return truncate_size; } expression_linspace(T start, T stop, size_t size, bool endpoint = false, bool truncate = false) : start(start), stop(stop), invsize(1.0 / T(endpoint ? size - 1 : size)), @@ -278,13 +305,15 @@ struct expression_linspace<T, true> : input_expression } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_linspace& self, cinput_t, size_t index, + vec_shape<T, N> x) { using TI = itype<T>; - return mix((enumerate(x) + cast<T>(cast<TI>(index))) * invsize, cast<T>(start), cast<T>(stop)); + return mix((enumerate(x) + static_cast<T>(static_cast<TI>(index))) * self.invsize, self.start, + self.stop); } template <typename U, size_t N> - CMT_INLINE static vec<U, N> mix(const vec<U, N>& t, U x, U y) + KFR_MEM_INTRINSIC static vec<U, N> mix(const vec<U, N>& t, U x, U y) { return (U(1.0) - t) * x + t * y; } @@ -296,16 +325,16 @@ struct expression_linspace<T, true> : input_expression }; template <typename... 
E> -struct expression_sequence : expression_base<E...> +struct expression_sequence : expression_with_arguments<E...> { public: - using base = expression_base<E...>; + using base = expression_with_arguments<E...>; using value_type = common_type<value_type_of<E>...>; using T = value_type; template <typename... Expr_> - CMT_INLINE expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) noexcept + KFR_MEM_INTRINSIC expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) CMT_NOEXCEPT : base(std::forward<Expr_>(expr)...) { std::copy(std::begin(segments), std::end(segments), this->segments.begin() + 1); @@ -314,20 +343,22 @@ public: } template <size_t N> - CMT_NOINLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_sequence& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - std::size_t sindex = size_t(std::upper_bound(std::begin(segments), std::end(segments), index) - 1 - - std::begin(segments)); - if (segments[sindex + 1] - index >= N) - return get(cinput, index, sindex - 1, y); + std::size_t sindex = + size_t(std::upper_bound(std::begin(self.segments), std::end(self.segments), index) - 1 - + std::begin(self.segments)); + if (self.segments[sindex + 1] - index >= N) + return get_elements(self, cinput, index, sindex - 1, y); else { vec<T, N> result; CMT_PRAGMA_CLANG(clang loop unroll_count(4)) for (size_t i = 0; i < N; i++) { - sindex = segments[sindex + 1] == index ? sindex + 1 : sindex; - result.data()[i] = get(cinput, index, sindex - 1, vec_t<T, 1>())[0]; + sindex = self.segments[sindex + 1] == index ? 
sindex + 1 : sindex; + result.data()[i] = get_elements(self, cinput, index, sindex - 1, vec_shape<T, 1>()).front(); index++; } return result; @@ -336,10 +367,11 @@ public: protected: template <size_t N> - CMT_NOINLINE vec<T, N> get(cinput_t cinput, size_t index, size_t expr_index, vec_t<T, N> y) + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_sequence& self, cinput_t cinput, + size_t index, size_t expr_index, vec_shape<T, N> y) { return cswitch(indicesfor_t<E...>(), expr_index, - [&](auto val) { return this->argument(cinput, val, index, y); }, + [&](auto val) { return self.argument(cinput, val, index, y); }, [&]() { return zerovector(y); }); } @@ -347,20 +379,24 @@ protected: }; template <typename Fn, typename E> -struct expression_adjacent : expression_base<E> +struct expression_adjacent : expression_with_arguments<E> { using value_type = value_type_of<E>; using T = value_type; - expression_adjacent(Fn&& fn, E&& e) : expression_base<E>(std::forward<E>(e)), fn(std::forward<Fn>(fn)) {} + expression_adjacent(Fn&& fn, E&& e) + : expression_with_arguments<E>(std::forward<E>(e)), fn(std::forward<Fn>(fn)) + { + } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_adjacent& self, cinput_t cinput, + size_t index, vec_shape<T, N>) { - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - const vec<T, N> delayed = insertleft(data, in); - data = in[N - 1]; - return this->fn(in, delayed); + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + const vec<T, N> delayed = insertleft(self.data, in); + self.data = in[N - 1]; + return self.fn(in, delayed); } Fn fn; mutable value_type data = value_type(0); @@ -370,7 +406,7 @@ struct expression_adjacent : expression_base<E> /** @brief Returns the subrange of the given expression */ template <typename E1> -CMT_INLINE internal::expression_slice<E1> slice(E1&& e1, size_t 
start, size_t size = infinite_size) +KFR_INTRINSIC internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t size = infinite_size) { return internal::expression_slice<E1>(std::forward<E1>(e1), start, size); } @@ -378,15 +414,15 @@ CMT_INLINE internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t si /** @brief Returns the expression truncated to the given size */ template <typename E1> -CMT_INLINE internal::expression_slice<E1> truncate(E1&& e1, size_t size) +KFR_INTRINSIC internal::expression_slice<E1> truncate(E1&& e1, size_t size) { return internal::expression_slice<E1>(std::forward<E1>(e1), 0, size); } -/** @brief Returns reversed expression +/** @brief Returns the reversed expression */ template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_reverse<E1> reverse(E1&& e1) +KFR_INTRINSIC internal::expression_reverse<E1> reverse(E1&& e1) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return internal::expression_reverse<E1>(std::forward<E1>(e1)); @@ -401,23 +437,24 @@ CMT_INLINE internal::expression_reverse<E1> reverse(E1&& e1) * @param truncate If ``true``, linspace returns exactly size elements, otherwise, returns infinite sequence */ template <typename T1, typename T2, bool precise = false, typename TF = ftype<common_type<T1, T2>>> -CMT_INLINE internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size, - bool endpoint = false, bool truncate = false) +KFR_INTRINSIC internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size, + bool endpoint = false, + bool truncate = false) { return internal::expression_linspace<TF, precise>(start, stop, size, endpoint, truncate); } KFR_FN(linspace) template <typename T, bool precise = false, typename TF = ftype<T>> -CMT_INLINE internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size, - bool endpoint = false) +KFR_INTRINSIC 
internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size, + bool endpoint = false) { return internal::expression_linspace<TF, precise>(symmetric_linspace, symsize, size, endpoint); } KFR_FN(symmlinspace) template <size_t size, typename... E> -CMT_INLINE internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens) +KFR_INTRINSIC internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens) { static_assert(size == sizeof...(E), "Lists must be of equal length"); return internal::expression_sequence<decay<E>...>(list, std::forward<E>(gens)...); @@ -428,7 +465,7 @@ KFR_FN(gen_sequence) * @brief Returns template expression that returns the result of calling \f$ fn(x_i, x_{i-1}) \f$ */ template <typename Fn, typename E1> -CMT_INLINE internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1) +KFR_INTRINSIC internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1) { return internal::expression_adjacent<Fn, E1>(std::forward<Fn>(fn), std::forward<E1>(e1)); } @@ -436,37 +473,38 @@ CMT_INLINE internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1) namespace internal { template <typename E> -struct expression_padded : expression_base<E> +struct expression_padded : expression_with_arguments<E> { using value_type = value_type_of<E>; - CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; } + KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; } expression_padded(value_type fill_value, E&& e) - : expression_base<E>(std::forward<E>(e)), fill_value(fill_value), input_size(e.size()) + : expression_with_arguments<E>(std::forward<E>(e)), fill_value(fill_value), input_size(e.size()) { } template <size_t N> - vec<value_type, N> operator()(cinput_t cinput, size_t index, vec_t<value_type, N> y) const + KFR_INTRINSIC friend vec<value_type, N> get_elements(const expression_padded& self, cinput_t cinput, + size_t index, 
vec_shape<value_type, N> y) { - if (index >= input_size) + if (index >= self.input_size) { - return fill_value; + return self.fill_value; } - else if (index + N <= input_size) + else if (index + N <= self.input_size) { - return this->argument_first(cinput, index, y); + return self.argument_first(cinput, index, y); } else { - vec<value_type, N> x; + vec<value_type, N> x{}; for (size_t i = 0; i < N; i++) { - if (index + i < input_size) - x[i] = this->argument_first(cinput, index + i, vec_t<value_type, 1>())[0]; + if (index + i < self.input_size) + x[i] = self.argument_first(cinput, index + i, vec_shape<value_type, 1>()).front(); else - x[i] = fill_value; + x[i] = self.fill_value; } return x; } @@ -507,44 +545,45 @@ private: }; template <typename... E> -struct expression_pack : expression_base<E...> +struct expression_pack : expression_with_arguments<E...> { constexpr static size_t count = sizeof...(E); - expression_pack(E&&... e) : expression_base<E...>(std::forward<E>(e)...) {} + expression_pack(E&&... e) : expression_with_arguments<E...>(std::forward<E>(e)...) {} using value_type = vec<common_type<value_type_of<E>...>, count>; using T = value_type; - using expression_base<E...>::size; + using expression_with_arguments<E...>::size; template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pack& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return this->call(cinput, fn::packtranspose(), index, y); + return self.call(cinput, fn::packtranspose(), index, y); } }; template <typename... 
E> -struct expression_unpack : private expression_base<E...>, output_expression +struct expression_unpack : private expression_with_arguments<E...>, output_expression { - using expression_base<E...>::begin_block; - using expression_base<E...>::end_block; + using expression_with_arguments<E...>::begin_block; + using expression_with_arguments<E...>::end_block; using output_expression::begin_block; using output_expression::end_block; constexpr static size_t count = sizeof...(E); - expression_unpack(E&&... e) : expression_base<E...>(std::forward<E>(e)...) {} + expression_unpack(E&&... e) : expression_with_arguments<E...>(std::forward<E>(e)...) {} - using expression_base<E...>::size; + using expression_with_arguments<E...>::size; template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x) + KFR_MEM_INTRINSIC void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x) { - output(coutput, index, x, csizeseq_t<count>()); + output(coutput, index, x, csizeseq<count>); } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> - CMT_INLINE expression_unpack& operator=(Input&& input) + KFR_MEM_INTRINSIC expression_unpack& operator=(Input&& input) { process(*this, std::forward<Input>(input)); return *this; @@ -554,7 +593,7 @@ private: template <typename U, size_t N, size_t... indices> void output(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x, csizes_t<indices...>) { - const vec<vec<U, N>, count> xx = compcast<vec<U, N>>(transpose<count>(flatten(x))); + const vec<vec<U, N>, count> xx = vec<vec<U, N>, count>::from_flatten(transpose<count>(flatten(x))); swallow{ (std::get<indices>(this->args)(coutput, index, xx[indices]), void(), 0)... 
}; } }; @@ -600,12 +639,13 @@ task_partition<OutExpr, InExpr> partition(OutExpr&& output, InExpr&& input, size { static_assert(!is_infinite<OutExpr>::value || !is_infinite<InExpr>::value, ""); - minimum_size = minimum_size == 0 ? platform<T>::vector_width * 8 : minimum_size; + minimum_size = minimum_size == 0 ? vector_width<T> * 8 : minimum_size; const size_t size = size_min(output.size(), input.size()); - const size_t chunk_size = align_up(std::max(size / count, minimum_size), platform<T>::vector_width); + const size_t chunk_size = align_up(std::max(size / count, minimum_size), vector_width<T>); task_partition<OutExpr, InExpr> result(std::forward<OutExpr>(output), std::forward<InExpr>(input), size, chunk_size, (size + chunk_size - 1) / chunk_size); return result; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/bitwise.hpp b/include/kfr/base/bitwise.hpp @@ -1,136 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "constants.hpp" -#include "vec.hpp" - -namespace kfr -{ - -CMT_INLINE float bitwisenot(float x) { return fbitcast(~ubitcast(x)); } -CMT_INLINE float bitwiseor(float x, float y) { return fbitcast(ubitcast(x) | ubitcast(y)); } -CMT_INLINE float bitwiseand(float x, float y) { return fbitcast(ubitcast(x) & ubitcast(y)); } -CMT_INLINE float bitwiseandnot(float x, float y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } -CMT_INLINE float bitwisexor(float x, float y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } -CMT_INLINE double bitwisenot(double x) { return fbitcast(~ubitcast(x)); } -CMT_INLINE double bitwiseor(double x, double y) { return fbitcast(ubitcast(x) | ubitcast(y)); } -CMT_INLINE double bitwiseand(double x, double y) { return fbitcast(ubitcast(x) & ubitcast(y)); } -CMT_INLINE double bitwiseandnot(double x, double y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } -CMT_INLINE double bitwisexor(double x, double y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } - -/// @brief Bitwise Not -template <typename T1> -CMT_INLINE T1 bitwisenot(const T1& x) -{ - return ~x; -} -KFR_FN(bitwisenot) - -/// @brief Bitwise And -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwiseand(const T1& x, const T2& y) -{ - return x & y; -} -template <typename T> -constexpr CMT_INLINE T bitwiseand(initialvalue<T>) -{ - return constants<T>::allones(); -} -KFR_FN(bitwiseand) - -/// @brief Bitwise And-Not -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y) -{ - return x & ~y; -} -template <typename T> -constexpr inline T bitwiseandnot(initialvalue<T>) -{ - return constants<T>::allones(); -} -KFR_FN(bitwiseandnot) - -/// @brief Bitwise Or -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwiseor(const T1& x, const T2& y) -{ - return x | y; -} -template <typename T> -constexpr CMT_INLINE T bitwiseor(initialvalue<T>) -{ - return subtype<T>(0); -} 
-KFR_FN(bitwiseor) - -/// @brief Bitwise Xor (Exclusive Or) -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> bitwisexor(const T1& x, const T2& y) -{ - return x ^ y; -} -template <typename T> -constexpr CMT_INLINE T bitwisexor(initialvalue<T>) -{ - return subtype<T>(); -} -KFR_FN(bitwisexor) - -/// @brief Bitwise Left shift -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> shl(const T1& left, const T2& right) -{ - return left << right; -} -KFR_FN(shl) - -/// @brief Bitwise Right shift -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> shr(const T1& left, const T2& right) -{ - return left >> right; -} -KFR_FN(shr) - -/// @brief Bitwise Left Rotate -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> rol(const T1& left, const T2& right) -{ - return shl(left, right) | shr(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); -} -KFR_FN(rol) - -/// @brief Bitwise Right Rotate -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> ror(const T1& left, const T2& right) -{ - return shr(left, right) | shl(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); -} -KFR_FN(ror) -} // namespace kfr diff --git a/include/kfr/base/clamp.hpp b/include/kfr/base/clamp.hpp @@ -1,62 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/clamp.hpp" - -namespace kfr -{ - -/// @brief Returns the first argument clamped to a range [lo, hi] -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), - typename Tout = common_type<T1, T2, T3>> -KFR_INTRIN Tout clamp(const T1& x, const T2& lo, const T3& hi) -{ - return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi)); -} - -/// @brief Creates an expression that returns the first argument clamped to a range [lo, hi] -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_INTRIN internal::expression_function<fn::clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi) -{ - return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) }; -} - -/// @brief Returns the first argument clamped to a range [0, hi] -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout clamp(const T1& x, const T2& hi) -{ - return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(hi)); -} - -/// @brief Creates an expression that returns the first argument clamped to a range [0, hi] -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::clamp, E1, E2> clamp(E1&& x, E2&& hi) -{ - return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(hi) }; -} -} // namespace kfr diff --git a/include/kfr/base/comparison.hpp b/include/kfr/base/comparison.hpp @@ -1,149 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin 
(https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "constants.hpp" -#include "expression.hpp" -#include "vec.hpp" - -namespace kfr -{ - -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> equal(const T1& x, const T2& y) -{ - return x == y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> notequal(const T1& x, const T2& y) -{ - return x != y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> less(const T1& x, const T2& y) -{ - return x < y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> greater(const T1& x, const T2& y) -{ - return x > y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> lessorequal(const T1& x, const T2& y) -{ - return x <= y; -} -template <typename T1, typename T2> -inline maskfor<common_type<T1, T2>> greaterorequal(const T1& x, const T2& y) -{ - return x >= y; -} -KFR_FN(equal) -KFR_FN(notequal) -KFR_FN(less) -KFR_FN(greater) -KFR_FN(lessorequal) -KFR_FN(greaterorequal) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, 
E2>::value)> -CMT_INLINE internal::expression_function<fn::equal, E1, E2> operator==(E1&& e1, E2&& e2) -{ - return { fn::equal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::notequal, E1, E2> operator!=(E1&& e1, E2&& e2) -{ - return { fn::notequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::less, E1, E2> operator<(E1&& e1, E2&& e2) -{ - return { fn::less(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::greater, E1, E2> operator>(E1&& e1, E2&& e2) -{ - return { fn::greater(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::lessorequal, E1, E2> operator<=(E1&& e1, E2&& e2) -{ - return { fn::lessorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::greaterorequal, E1, E2> operator>=(E1&& e1, E2&& e2) -{ - return { fn::greaterorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isnan(const vec<T, N>& x) -{ - return x != x; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isinf(const vec<T, N>& x) -{ - return x == constants<T>::infinity || x == -constants<T>::infinity; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isfinite(const vec<T, N>& x) -{ - return !isnan(x) && !isinf(x); -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> isnegative(const vec<T, N>& x) -{ - return (x & 
constants<T>::highbitmask()) != 0; -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> ispositive(const vec<T, N>& x) -{ - return !isnegative(x); -} - -template <typename T, size_t N> -CMT_INLINE mask<T, N> iszero(const vec<T, N>& x) -{ - return x == T(); -} - -template <typename T1, typename T2, typename T3> -KFR_SINTRIN maskfor<common_type<T1, T2, T3>> inrange(const T1& x, const T2& min, const T3& max) -{ - return x >= min && x <= max; -} -} // namespace kfr diff --git a/include/kfr/base/compiletime.hpp b/include/kfr/base/compiletime.hpp @@ -1,84 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once -#include "constants.hpp" -#include "operators.hpp" -#include "types.hpp" - -namespace kfr -{ - -namespace compiletime -{ - -template <typename T> -constexpr inline T select(bool c, T x, T y) -{ - return c ? x : y; -} -template <typename T> -constexpr inline T trunc(T x) -{ - return static_cast<T>(static_cast<long long>(x)); -} -template <typename T> -constexpr inline T abs(T x) -{ - return x < T() ? 
-x : x; -} -template <typename T> -constexpr inline T mulsign(T x, T y) -{ - return y < T() ? -x : x; -} -template <typename T> -constexpr inline T sin(T x) -{ - x = x - trunc(x / c_pi<T, 2>) * c_pi<T, 2>; - constexpr T c2 = -0.16665853559970855712890625; - constexpr T c4 = +8.31427983939647674560546875e-3; - constexpr T c6 = -1.85423981747590005397796630859375e-4; - - x -= c_pi<T>; - T y = abs(x); - y = select(y > c_pi<T, 1, 2>, c_pi<T> - y, y); - y = mulsign(y, -x); - - const T y2 = y * y; - T formula = c6; - const T y3 = y2 * y; - formula = fmadd(formula, y2, c4); - formula = fmadd(formula, y2, c2); - formula = formula * y3 + y; - return formula; -} -template <typename T> -constexpr inline T cos(T x) -{ - return sin(x + c_pi<T, 1, 2>); -} -} // namespace compiletime -} // namespace kfr diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp @@ -1,967 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "abs.hpp" -#include "atan.hpp" -#include "constants.hpp" -#include "function.hpp" -#include "hyperbolic.hpp" -#include "log_exp.hpp" -#include "min_max.hpp" -#include "operators.hpp" -#include "select.hpp" -#include "sin_cos.hpp" -#include "sqrt.hpp" - -#ifdef KFR_STD_COMPLEX -#include <complex> -#endif - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4814)) - -namespace kfr -{ -#ifdef KFR_STD_COMPLEX - -template <typename T> -using complex = std::complex<T>; - -#else -#ifndef KFR_CUSTOM_COMPLEX - -/** - * @brief Represents the complex numbers. If KFR_STD_COMPLEX is defined, then kfr::complex is an alias for - * std::complex. - */ -template <typename T> -struct complex -{ - static_assert(is_simd_type<T>::value, "Incorrect type for complex"); - constexpr static bool is_pod = true; - constexpr complex() noexcept = default; - constexpr complex(T re) noexcept : re(re), im(0) {} - constexpr complex(T re, T im) noexcept : re(re), im(im) {} - constexpr complex(const complex&) noexcept = default; - constexpr complex(complex&&) noexcept = default; - template <typename U> - constexpr complex(const complex<U>& other) noexcept - : re(static_cast<T>(other.re)), im(static_cast<T>(other.im)) - { - } - template <typename U> - constexpr complex(complex<U>&& other) noexcept : re(std::move(other.re)), im(std::move(other.im)) - { - } -#ifdef CMT_COMPILER_GNU - constexpr complex& operator=(const complex&) noexcept = default; - constexpr complex& operator=(complex&&) noexcept = default; -#else - complex& operator=(const complex&) = default; - complex& operator=(complex&&) = default; -#endif - constexpr const T& real() const noexcept { return re; } - constexpr const T& imag() const noexcept { return im; } - constexpr void real(T value) noexcept { re = value; } - constexpr void imag(T value) noexcept { im = value; } - T re; - T im; - - KFR_INTRIN friend complex operator+(const complex& x, const complex& y) - { - return (make_vector(x) + 
make_vector(y))[0]; - } - KFR_INTRIN friend complex operator-(const complex& x, const complex& y) - { - return (make_vector(x) - make_vector(y))[0]; - } - KFR_INTRIN friend complex operator*(const complex& x, const complex& y) - { - return (make_vector(x) * make_vector(y))[0]; - } - KFR_INTRIN friend complex operator/(const complex& x, const complex& y) - { - return (make_vector(x) / make_vector(y))[0]; - } - - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator+(const complex& x, const U& y) - { - return static_cast<C>(x) + static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator-(const complex& x, const U& y) - { - return static_cast<C>(x) - static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator*(const complex& x, const U& y) - { - return static_cast<C>(x) * static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator/(const complex& x, const U& y) - { - return static_cast<C>(x) / static_cast<C>(y); - } - - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator+(const U& x, const complex& y) - { - return static_cast<C>(x) + static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator-(const U& x, const complex& y) - { - return static_cast<C>(x) - static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>> - KFR_INTRIN friend C operator*(const U& x, const complex& y) - { - return static_cast<C>(x) * static_cast<C>(y); - } - template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = 
common_type<complex, U>> - KFR_INTRIN friend C operator/(const U& x, const complex& y) - { - return static_cast<C>(x) / static_cast<C>(y); - } - KFR_INTRIN friend complex operator-(const complex& x) { return (-make_vector(x))[0]; } -}; -#endif -#endif -} // namespace kfr -namespace cometa -{ -template <typename T> -struct compound_type_traits<kfr::complex<T>> -{ - constexpr static size_t width = 2; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::complex<U>; - template <typename U> - using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>; - - static constexpr subtype at(const kfr::complex<T>& value, size_t index) - { - return index == 0 ? value.real() : value.imag(); - } -}; -} // namespace cometa -namespace kfr -{ - -/// @brief Alias for complex<f32> -using c32 = complex<f32>; - -/// @brief Alias for complex<f64> -using c64 = complex<f64>; - -/// @brief Alias for complex<fbase> -using cbase = complex<fbase>; - -namespace internal -{ -template <typename T> -constexpr inline vec<T, 2> vcomplex(const complex<T>& v) -{ - return vec<T, 2>(v.real(), v.imag()); -} -} // namespace internal - -/// @brief vec<> specialization for complex numbers. 
Implements all operators -template <typename T, size_t N> -struct vec<complex<T>, N> : private vec<T, 2 * N> -{ - using base = vec<T, 2 * N>; - - using value_type = complex<T>; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr static size_t scalar_size() noexcept { return 2 * N; } - - using simd_type = typename base::simd_type; - - constexpr vec() noexcept = default; - constexpr vec(const vec&) noexcept = default; - CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default; - template <int = 0> - constexpr vec(const simd_type& simd) noexcept : base(simd) - { - } - KFR_I_CE vec(czeros_t) noexcept : base(czeros) {} - KFR_I_CE vec(cones_t) noexcept : base(cones) {} - KFR_I_CE vec(const value_type& s) noexcept : base(repeat<N>(vec<T, 2>(s.real(), s.imag()))) {} - - template <typename U> - KFR_I_CE vec(const complex<U>& s) noexcept - : base(repeat<N>(vec<T, 2>(static_cast<T>(s.real()), static_cast<T>(s.imag())))) - { - } - template <typename U> - KFR_I_CE vec(const vec<complex<U>, N>& v) noexcept : base(static_cast<vec<T, N * 2>>(v.flatten())) - { - } - - explicit KFR_I_CE vec(const vec<T, N * 2>& v) noexcept : base(v) {} - - // from real - KFR_I_CE vec(const T& r) noexcept : base(interleave(vec<T, N>(r), vec<T, N>(czeros))) {} - // from real - template <typename U, typename = enable_if<std::is_convertible<U, T>::value>> - KFR_I_CE vec(const vec<U, N>& r) noexcept : base(interleave(vec<T, N>(r), vec<T, N>(czeros))) - { - } - - // from list of vectors - template <typename... Us> - KFR_I_CE vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : base(internal::vcomplex(s0), internal::vcomplex(s1), - internal::vcomplex(static_cast<value_type>(rest))...) 
- { - } - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(value_type) * N)> - KFR_I_CE static vec frombits(const vec<U, M>& v) noexcept - { - return vec(vec<T, scalar_size()>::frombits(v.flatten())); - } - -#define KFR_B(x) static_cast<const base&>(x) - // math / bitwise / comparison operators - KFR_I_CE friend vec operator+(const vec& x) noexcept { return x; } - KFR_I_CE friend vec operator-(const vec& x) noexcept { return vec(-KFR_B(x)); } - KFR_I_CE friend vec operator~(const vec& x) noexcept { return vec(~KFR_B(x)); } - - KFR_I_CE friend vec operator+(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) + KFR_B(y)); } - KFR_I_CE friend vec operator-(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) - KFR_B(y)); } - CMT_GNU_CONSTEXPR friend vec operator*(const vec& x, const vec& y) noexcept - { - const vec<scalar_type, N* 2> xx = x; - const vec<scalar_type, N* 2> yy = y; - return vec(subadd(xx * dupeven(yy), swap<2>(xx) * dupodd(yy))); - } - CMT_GNU_CONSTEXPR friend vec operator/(const vec& x, const vec& y) noexcept - { - const vec<scalar_type, N* 2> xx = x; - const vec<scalar_type, N* 2> yy = y; - const vec<scalar_type, N* 2> m = (sqr(dupeven(yy)) + sqr(dupodd(yy))); - return vec(swap<2>(subadd(swap<2>(xx) * dupeven(yy), xx * dupodd(yy)) / m)); - } - - KFR_I_CE friend vec operator&(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) & KFR_B(y)); } - KFR_I_CE friend vec operator|(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) | KFR_B(y)); } - KFR_I_CE friend vec operator^(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) ^ KFR_B(y)); } - - KFR_I_CE friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; } - KFR_I_CE friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; } - KFR_I_CE friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; } - KFR_I_CE friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; } - - KFR_I_CE 
friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; } - KFR_I_CE friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; } - KFR_I_CE friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; } - - KFR_I_CE friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } - KFR_I_CE friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - KFR_I_CE friend vec operator++(vec& x, int) noexcept - { - const vec z = x; - ++x; - return z; - } - KFR_I_CE friend vec operator--(vec& x, int) noexcept - { - const vec z = x; - --x; - return z; - } - - // shuffle - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return *base::shuffle(scale<2, indices...>()); - } - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return *base::shuffle(y, scale<2, indices...>()); - } - - // element access - struct element; - KFR_I_CE value_type operator[](size_t index) const noexcept { return get(index); } - KFR_I_CE element operator[](size_t index) noexcept { return { *this, index }; } - - KFR_I_CE value_type get(size_t index) const noexcept - { - return reinterpret_cast<const value_type(&)[N]>(*this)[index]; - } - KFR_I_CE void set(size_t index, const value_type& s) noexcept - { - reinterpret_cast<value_type(&)[N]>(*this)[index] = s; - } - template <size_t index> - KFR_I_CE value_type get(csize_t<index>) const noexcept - { - return static_cast<const base&>(*this).shuffle(csizeseq_t<2, index * 2>()); - } - template <size_t index> - KFR_I_CE void set(csize_t<index>, const value_type& s) noexcept - { - *this = vec(static_cast<const base&>(*this)) - .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * 2>() && - csizeseq_t<N>() < csize_t<(index + 1) * 2>()) * - N); - } - struct element - { - KFR_I_CE operator value_type() const noexcept { return 
v.get(index); } - element& operator=(const value_type& s) noexcept - { - v.set(index, s); - return *this; - } - - element& operator=(const element& s) noexcept - { - v.set(index, static_cast<value_type>(s)); - return *this; - } - template <typename U, size_t M> - element& operator=(const typename vec<U, M>::element& s) noexcept - { - v.set(index, static_cast<value_type>(static_cast<U>(s))); - return *this; - } - - vec& v; - size_t index; - }; - - template <bool aligned = false> - explicit KFR_I_CE vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept - : base(ptr_cast<T>(src), cbool_t<aligned>()) - { - } - template <bool aligned = false> - const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept - { - base::write(ptr_cast<T>(dest), cbool_t<aligned>()); - return *this; - } - - const base& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return base::operator*(); } - simd_type& operator*() noexcept { return base::operator*(); } -}; - -/// @brief Returns vector of complex values with real part duplicated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x) -{ - return compcast<complex<T>>(dupeven(compcast<T>(x))); -} -KFR_FN(cdupreal) - -/// @brief Returns vector of complex values with imaginary part duplicated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x) -{ - return compcast<complex<T>>(dupodd(compcast<T>(x))); -} -KFR_FN(cdupimag) - -/// @brief Returns vector of complex values with real and imaginary parts swapped -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x) -{ - return compcast<complex<T>>(swap<2>(compcast<T>(x))); -} -KFR_FN(cswapreim) - -/// @brief Returns vector of complex values with real part negated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x) -{ - return 
x ^ complex<T>(-T(), T()); -} -KFR_FN(cnegreal) - -/// @brief Returns vector of complex values with imaginary part negated -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x) -{ - return x ^ complex<T>(T(), -T()); -} -KFR_FN(cnegimag) - -namespace internal -{ -template <typename T> -struct is_complex_impl : std::false_type -{ -}; -template <typename T> -struct is_complex_impl<complex<T>> : std::true_type -{ -}; - -// vector<complex> to vector<complex> -template <typename To, typename From, size_t N> -struct conversion<vec<complex<To>, N>, vec<complex<From>, N>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<complex<To>, N> cast(const vec<complex<From>, N>& value) - { - return builtin_convertvector<complex<To>>(value); - } -}; - -// vector to vector<complex> -template <typename To, typename From, size_t N> -struct conversion<vec<complex<To>, N>, vec<From, N>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<complex<To>, N> cast(const vec<From, N>& value) - { - const vec<To, N> casted = static_cast<vec<To, N>>(value); - return *interleave(casted, zerovector(casted)); - } -}; - -} // namespace internal - -template <typename T, size_t N> -constexpr CMT_INLINE vec<complex<T>, N / 2> ccomp(const vec<T, N>& x) -{ - return compcast<complex<T>>(x); -} - -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N * 2> cdecom(const vec<complex<T>, N>& x) -{ - return compcast<T>(x); -} - -/// @brief Returns the real part of the complex value -template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> -constexpr CMT_INLINE T real(const T& value) -{ - return value; -} - -/// @brief Returns the real part of the complex value -template <typename T> -constexpr CMT_INLINE T real(const complex<T>& value) -{ - return value.real(); -} - -/// @brief Returns the real part of the complex value -template <typename 
T, size_t N> -constexpr CMT_INLINE vec<T, N> real(const vec<complex<T>, N>& value) -{ - return even(compcast<T>(value)); -} - -template <typename T> -using realtype = decltype(kfr::real(std::declval<T>())); -template <typename T> -using realftype = ftype<decltype(kfr::real(std::declval<T>()))>; - -KFR_FN(real) - -/// @brief Returns the real part of the complex value -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::real, E1> real(E1&& x) -{ - return { {}, std::forward<E1>(x) }; -} - -/// @brief Returns the imaginary part of the complex value -template <typename T> -constexpr CMT_INLINE T imag(const complex<T>& value) -{ - return value.imag(); -} - -/// @brief Returns the imaginary part of the complex value -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> imag(const vec<complex<T>, N>& value) -{ - return odd(compcast<T>(value)); -} -KFR_FN(imag) - -/// @brief Returns the imaginary part of the complex value -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::imag, E1> imag(E1&& x) -{ - return { {}, std::forward<E1>(x) }; -} - -/// @brief Constructs complex value from real and imaginary parts -template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>> -constexpr CMT_INLINE vec<complex<T>, N> make_complex(const vec<T1, N>& real, const vec<T2, N>& imag = T2(0)) -{ - return compcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag))); -} - -/// @brief Constructs complex value from real and imaginary parts -template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>> -constexpr CMT_INLINE complex<T> make_complex(T1 real, T2 imag = T2(0)) -{ - return complex<T>(cast<T>(real), cast<T>(imag)); -} - -namespace intrinsics -{ -template <typename T, size_t N> -CMT_INLINE vec<complex<T>, N> cconj(const vec<complex<T>, N>& x) -{ - return cnegimag(x); -} -template <typename T, size_t N> 
-KFR_SINTRIN vec<complex<T>, N> csin(const vec<complex<T>, N>& x) -{ - return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> csinh(const vec<complex<T>, N>& x) -{ - return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> ccos(const vec<complex<T>, N>& x) -{ - return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x) -{ - return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabs(const vec<complex<T>, N>& x) -{ - const vec<T, N* 2> xx = sqr(cdecom(x)); - return sqrt(even(xx) + odd(xx)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> carg(const vec<complex<T>, N>& x) -{ - const vec<T, N* 2> xx = cdecom(x); - return atan2(even(xx), odd(xx)); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> clog(const vec<complex<T>, N>& x) -{ - return make_complex(log(cabs(x)), carg(x)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> clog2(const vec<complex<T>, N>& x) -{ - return clog(x) * c_recip_log_2<T>; -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> clog10(const vec<complex<T>, N>& x) -{ - return clog(x) * c_recip_log_10<T>; -} - -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cexp(const vec<complex<T>, N>& x) -{ - return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x) -{ - return cexp(x * c_log_2<T>); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x) -{ - return cexp(x * c_log_10<T>); -} - -template <typename T, size_t N> -KFR_SINTRIN 
vec<complex<T>, N> polar(const vec<complex<T>, N>& x) -{ - return make_complex(cabs(x), carg(x)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x) -{ - return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x)))); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabsdup(const vec<T, N>& x) -{ - x = sqr(x); - return sqrt(x + swap<2>(x)); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x) -{ - const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5); - return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t)); -} - -KFR_I_CONVERTER(cconj) -KFR_I_CONVERTER(csin) -KFR_I_CONVERTER(csinh) -KFR_I_CONVERTER(ccos) -KFR_I_CONVERTER(ccosh) -KFR_I_CONVERTER(clog) -KFR_I_CONVERTER(clog2) -KFR_I_CONVERTER(clog10) -KFR_I_CONVERTER(cexp) -KFR_I_CONVERTER(cexp2) -KFR_I_CONVERTER(cexp10) -KFR_I_CONVERTER(polar) -KFR_I_CONVERTER(cartesian) -KFR_I_CONVERTER(csqrt) - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabs(const vec<T, N>& a) -{ - return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a))); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> carg(const vec<T, N>& a) -{ - return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a))); -} -template <typename T1> -KFR_SINTRIN realtype<T1> cabs(const T1& a) -{ - using vecout = vec1<T1>; - return to_scalar(intrinsics::cabs(vecout(a))); -} -template <typename T1> -KFR_SINTRIN realtype<T1> carg(const T1& a) -{ - using vecout = vec1<T1>; - return to_scalar(intrinsics::carg(vecout(a))); -} -} // namespace intrinsics - -KFR_I_FN(cconj) -KFR_I_FN(csin) -KFR_I_FN(csinh) -KFR_I_FN(ccos) -KFR_I_FN(ccosh) -KFR_I_FN(cabs) -KFR_I_FN(carg) -KFR_I_FN(clog) -KFR_I_FN(clog2) -KFR_I_FN(clog10) -KFR_I_FN(cexp) -KFR_I_FN(cexp2) -KFR_I_FN(cexp10) -KFR_I_FN(polar) -KFR_I_FN(cartesian) -KFR_I_FN(csqrt) - -/// @brief Returns the sine of the complex number x 
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 csin(const T1& x) -{ - return intrinsics::csin(x); -} - -/// @brief Returns template expression that returns the sine of the the complex value x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::csin, E1> csin(E1&& x) -{ - return { fn::csin(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic sine of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 csinh(const T1& x) -{ - return intrinsics::csinh(x); -} - -/// @brief Returns template expression that returns the hyperbolic sine of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::csinh, E1> csinh(E1&& x) -{ - return { fn::csinh(), std::forward<E1>(x) }; -} - -/// @brief Returns the cosine of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 ccos(const T1& x) -{ - return intrinsics::ccos(x); -} - -/// @brief Returns template expression that returns the cosine of the the complex value x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::ccos, E1> ccos(E1&& x) -{ - return { fn::ccos(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cosine of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 ccosh(const T1& x) -{ - return intrinsics::ccosh(x); -} - -/// @brief Returns template expression that returns the hyperbolic cosine of the the complex value x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::ccosh, E1> ccosh(E1&& x) -{ - return { fn::ccosh(), std::forward<E1>(x) }; -} - -/// @brief Returns the absolute value (magnitude) of the complex number x -template <typename T1, 
KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC realtype<T1> cabs(const T1& x) -{ - return intrinsics::cabs(x); -} - -/// @brief Returns template expression that returns the absolute value (magnitude) of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cabs, E1> cabs(E1&& x) -{ - return { fn::cabs(), std::forward<E1>(x) }; -} - -/// @brief Returns the phase angle (argument) of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC realtype<T1> carg(const T1& x) -{ - return intrinsics::carg(x); -} - -/// @brief Returns template expression that returns the phase angle (argument) of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::carg, E1> carg(E1&& x) -{ - return { fn::carg(), std::forward<E1>(x) }; -} - -/// @brief Returns the complex conjugate of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cconj(const T1& x) -{ - return intrinsics::cconj(x); -} - -/// @brief Returns template expression that returns the complex conjugate of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cconj, E1> cconj(E1&& x) -{ - return { fn::cconj(), std::forward<E1>(x) }; -} - -/// @brief Returns the natural logarithm of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 clog(const T1& x) -{ - return intrinsics::clog(x); -} - -/// @brief Returns template expression that returns the natural logarithm of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::clog, E1> clog(E1&& x) -{ - return { fn::clog(), std::forward<E1>(x) }; -} - -/// @brief Returns the binary (base-2) logarithm of the complex number x -template <typename 
T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 clog2(const T1& x) -{ - return intrinsics::clog2(x); -} - -/// @brief Returns template expression that returns the binary (base-2) logarithm of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::clog2, E1> clog2(E1&& x) -{ - return { fn::clog2(), std::forward<E1>(x) }; -} - -/// @brief Returns the common (base-10) logarithm of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 clog10(const T1& x) -{ - return intrinsics::clog10(x); -} - -/// @brief Returns template expression that returns the common (base-10) logarithm of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::clog10, E1> clog10(E1&& x) -{ - return { fn::clog10(), std::forward<E1>(x) }; -} - -/// @brief Returns \f$e\f$ raised to the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cexp(const T1& x) -{ - return intrinsics::cexp(x); -} - -/// @brief Returns template expression that returns \f$e\f$ raised to the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cexp, E1> cexp(E1&& x) -{ - return { fn::cexp(), std::forward<E1>(x) }; -} - -/// @brief Returns 2 raised to the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cexp2(const T1& x) -{ - return intrinsics::cexp2(x); -} - -/// @brief Returns template expression that returns 2 raised to the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cexp2, E1> cexp2(E1&& x) -{ - return { fn::cexp2(), std::forward<E1>(x) }; -} - -/// @brief Returns 10 raised to the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC 
T1 cexp10(const T1& x) -{ - return intrinsics::cexp10(x); -} - -/// @brief Returns template expression that returns 10 raised to the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cexp10, E1> cexp10(E1&& x) -{ - return { fn::cexp10(), std::forward<E1>(x) }; -} - -/// @brief Converts complex number to polar -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 polar(const T1& x) -{ - return intrinsics::polar(x); -} - -/// @brief Returns template expression that converts complex number to polar -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::polar, E1> polar(E1&& x) -{ - return { fn::polar(), std::forward<E1>(x) }; -} - -/// @brief Converts complex number to cartesian -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 cartesian(const T1& x) -{ - return intrinsics::cartesian(x); -} - -/// @brief Returns template expression that converts complex number to cartesian -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cartesian, E1> cartesian(E1&& x) -{ - return { fn::cartesian(), std::forward<E1>(x) }; -} - -/// @brief Returns square root of the complex number x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 csqrt(const T1& x) -{ - return intrinsics::csqrt(x); -} - -/// @brief Returns template expression that returns square root of the complex number x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::csqrt, E1> csqrt(E1&& x) -{ - return { fn::csqrt(), std::forward<E1>(x) }; -} -} // namespace kfr - -namespace std -{ -template <typename T1, typename T2> -struct common_type<kfr::complex<T1>, kfr::complex<T2>> -{ - using type = kfr::complex<typename common_type<T1, T2>::type>; -}; -template <typename T1, typename T2> -struct 
common_type<kfr::complex<T1>, T2> -{ - using type = kfr::complex<typename common_type<T1, T2>::type>; -}; -template <typename T1, typename T2> -struct common_type<T1, kfr::complex<T2>> -{ - using type = kfr::complex<typename common_type<T1, T2>::type>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::complex<T1>, kfr::vec<T2, N>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<T1, N>, kfr::complex<T2>> -{ - using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; -}; -} // namespace std - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/constants.hpp b/include/kfr/base/constants.hpp @@ -1,299 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "types.hpp" -#include <limits> - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4309)) -CMT_PRAGMA_MSVC(warning(disable : 4146)) - -namespace kfr -{ - -#if CMT_COMPILER_GNU -constexpr double infinity = __builtin_inf(); -constexpr double qnan = __builtin_nan(""); -#else -constexpr double infinity = HUGE_VAL; -constexpr double qnan = NAN; -#endif - -template <typename T> -struct constants -{ -public: - using Tsub = subtype<T>; - - constexpr static Tsub pi_s(int m, int d = 1) { return pi * m / d; } - constexpr static Tsub recip_pi_s(int m, int d = 1) { return recip_pi * m / d; } - - constexpr static Tsub pi = static_cast<Tsub>(3.1415926535897932384626433832795); - constexpr static Tsub sqr_pi = static_cast<Tsub>(9.8696044010893586188344909998762); - constexpr static Tsub recip_pi = static_cast<Tsub>(0.31830988618379067153776752674503); - constexpr static Tsub degtorad = static_cast<Tsub>(pi / 180); - constexpr static Tsub radtodeg = static_cast<Tsub>(pi * 180); - constexpr static Tsub e = static_cast<Tsub>(2.718281828459045235360287471352662); - constexpr static Tsub recip_log_2 = static_cast<Tsub>(1.442695040888963407359924681001892137426645954); - constexpr static Tsub recip_log_10 = static_cast<Tsub>(0.43429448190325182765112891891661); - constexpr static Tsub log_2 = static_cast<Tsub>(0.69314718055994530941723212145818); - constexpr static Tsub log_10 = static_cast<Tsub>(2.3025850929940456840179914546844); - constexpr static Tsub sqrt_2 = static_cast<Tsub>(1.4142135623730950488016887242097); - - constexpr static Tsub fold_constant_div = choose_const<Tsub>( - CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01)); - - constexpr static Tsub fold_constant_hi = choose_const<Tsub>( - 
CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01)); - constexpr static Tsub fold_constant_rem1 = - choose_const<Tsub>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f), - CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08)); - constexpr static Tsub fold_constant_rem2 = - choose_const<Tsub>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f), - CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15)); - - constexpr static Tsub epsilon = std::numeric_limits<Tsub>::epsilon(); - constexpr static Tsub infinity = std::numeric_limits<Tsub>::infinity(); - constexpr static Tsub neginfinity = -std::numeric_limits<Tsub>::infinity(); - constexpr static Tsub qnan = std::numeric_limits<Tsub>::quiet_NaN(); - -#if CMT_COMPILER_GNU - - CMT_PRAGMA_GNU(GCC diagnostic push) - CMT_PRAGMA_GNU(GCC diagnostic ignored "-Woverflow") - - constexpr static Tsub allones() - { - if (is_same<Tsub, f32>::value) - { - return -__builtin_nanf("0xFFFFFFFF"); - } - else if (is_same<Tsub, f64>::value) - { - return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); - } - else - { - return static_cast<Tsub>(-1ll); - } - } - - constexpr static Tsub allzeros() { return Tsub(0); } - - constexpr static Tsub highbitmask() - { - if (is_same<Tsub, f32>::value) - { - return -0.0f; - } - else if (is_same<Tsub, f64>::value) - { - return -0.0; - } - else - { - return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1)); - } - } - - constexpr static Tsub invhighbitmask() - { - if (is_same<Tsub, f32>::value) - { - return __builtin_nanf("0xFFFFFFFF"); - } - else if (is_same<Tsub, f64>::value) - { - return __builtin_nan("0xFFFFFFFFFFFFFFFF"); - } - else - { - return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1); - } - } - CMT_PRAGMA_GNU(GCC diagnostic pop) -#else - - static Tsub allones() - { - if (is_same<Tsub, f32>::value) - { - return static_cast<Tsub>(bitcast<f32>(0xFFFFFFFFu)); - } - else if (is_same<Tsub, f64>::value) - { - return 
static_cast<Tsub>(bitcast<f64>(0xFFFFFFFFFFFFFFFFull)); - } - else - { - return static_cast<Tsub>(-1ll); - } - } - - constexpr static Tsub allzeros() { return Tsub(0); } - - static Tsub highbitmask() - { - if (is_same<Tsub, f32>::value) - { - return static_cast<Tsub>(-0.0f); - } - else if (is_same<Tsub, f64>::value) - { - return static_cast<Tsub>(-0.0); - } - else - { - return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1)); - } - } - - static Tsub invhighbitmask() - { - if (is_same<Tsub, f32>::value) - { - return static_cast<Tsub>(bitcast<f32>(0x7FFFFFFFu)); - } - else if (is_same<Tsub, f64>::value) - { - return static_cast<Tsub>(bitcast<f64>(0x7FFFFFFFFFFFFFFFull)); - } - else - { - return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1); - } - } -#endif -}; - -template <typename T> -constexpr subtype<T> constants<T>::pi; -template <typename T> -constexpr subtype<T> constants<T>::sqr_pi; -template <typename T> -constexpr subtype<T> constants<T>::recip_pi; -template <typename T> -constexpr subtype<T> constants<T>::degtorad; -template <typename T> -constexpr subtype<T> constants<T>::radtodeg; -template <typename T> -constexpr subtype<T> constants<T>::e; -template <typename T> -constexpr subtype<T> constants<T>::recip_log_2; -template <typename T> -constexpr subtype<T> constants<T>::recip_log_10; -template <typename T> -constexpr subtype<T> constants<T>::log_2; -template <typename T> -constexpr subtype<T> constants<T>::log_10; -template <typename T> -constexpr subtype<T> constants<T>::sqrt_2; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_div; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_hi; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_rem1; -template <typename T> -constexpr subtype<T> constants<T>::fold_constant_rem2; -template <typename T> -constexpr subtype<T> constants<T>::epsilon; -template <typename T> -constexpr subtype<T> constants<T>::infinity; -template <typename T> 
-constexpr subtype<T> constants<T>::neginfinity; -template <typename T> -constexpr subtype<T> constants<T>::qnan; - -/// π (pi) -/// c_pi<f64, 4> = 4pi -/// c_pi<f64, 3, 4> = 3/4pi -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d); - -/// π² (pi²) -/// c_sqr_pi<f64, 4> = 4pi² -/// c_sqr_pi<f64, 3, 4> = 3/4pi² -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d); - -/// 1/π (1/pi) -/// c_recip_pi<f64> 1/pi -/// c_recip_pi<f64, 4> 4/pi -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_recip_pi = subtype<T>(0.31830988618379067153776752674503 * m / d); - -/// degree to radian conversion factor -template <typename T> -constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>; - -/// radian to degree conversion factor -template <typename T> -constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>; - -/// e, Euler's number -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d); - -template <typename T> -constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 32 ? 
23 : 52; - -template <typename T> -constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1; - -template <typename T> -constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon()); - -/// infinity -template <typename T> -constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity(); - -/// -infinity -template <typename T> -constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity(); - -/// Quiet NaN -template <typename T> -constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN(); - -template <typename T> -constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954); - -template <typename T> -constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661); - -template <typename T> -constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818); - -template <typename T> -constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844); - -template <typename T, int m = 1, int d = 1> -constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d); -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp @@ -1,4 +1,4 @@ -/** @addtogroup math +/** @addtogroup conversion * @{ */ /* @@ -25,12 +25,15 @@ */ #pragma once -#include "types.hpp" +#include "../math/clamp.hpp" +#include "../simd/types.hpp" +#include "../simd/vec.hpp" #include "univector.hpp" -#include "vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class audio_sample_type { @@ -179,7 +182,7 @@ template <typename Tout, typename Tin, typename Tout_traits = audio_sample_trait inline Tout convert_sample(const Tin& in) { constexpr auto scale = Tout_traits::scale / Tin_traits::scale; - return cast<Tout>(clamp(in * scale, -Tout_traits::scale, +Tout_traits::scale)); + return innercast<Tout>(clamp(in * scale, -Tout_traits::scale, 
+Tout_traits::scale)); } /// @brief Deinterleaves and converts audio samples @@ -275,4 +278,5 @@ void convert(void* out, audio_sample_type out_type, const Tin* in, size_t size) convert(reinterpret_cast<type*>(out), in, size); }); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp @@ -1,107 +0,0 @@ -/** @addtogroup shuffle - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once -#include "shuffle.hpp" -#include "types.hpp" - -namespace kfr -{ - -namespace internal -{ - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") - -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } - -template <u32 m, u32 shift, u32... 
values> -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) -{ - return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>()); -} - -template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>) -{ - return bit_permute_step_impl( - x, - cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); -} - -template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>) -{ - return bit_permute_step_impl( - x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); -} - -CMT_PRAGMA_GNU(GCC diagnostic pop) - -template <size_t radix, size_t bits> -struct shuffle_index_digitreverse -{ - constexpr inline size_t operator()(size_t index) const noexcept - { - return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>()); - } -}; -} // namespace internal - -template <size_t radix, size_t group = 1, typename T, size_t N> -CMT_INLINE vec<T, N> digitreverse(const vec<T, N>& x) -{ - return x.shuffle(scale<group>( - csizeseq_t<N / group>().map(internal::shuffle_index_digitreverse<radix, ilog2(N / group)>()))); -} - -template <size_t groupsize = 1, typename T, size_t N> -CMT_INLINE vec<T, N> bitreverse(const vec<T, N>& x) -{ - return digitreverse<2, groupsize>(x); -} - -template <size_t groupsize = 1, typename T, size_t N> -CMT_INLINE vec<T, N> digitreverse4(const vec<T, N>& x) -{ - return digitreverse<4, groupsize>(x); -} - -template <size_t bits> -constexpr inline u32 bitreverse(u32 x) -{ - return internal::digitreverse_impl<bits>(x, csize_t<2>()); -} - -template <size_t bits> -constexpr inline u32 digitreverse4(u32 x) -{ - return internal::digitreverse_impl<bits>(x, csize_t<4>()); -} -} // namespace kfr diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp @@ -25,9 +25,10 @@ */ #pragma once -#include "platform.hpp" -#include "types.hpp" -#include 
"vec.hpp" +#include "../simd/platform.hpp" +#include "../simd/shuffle.hpp" +#include "../simd/types.hpp" +#include "../simd/vec.hpp" #include <tuple> #ifdef KFR_STD_COMPLEX @@ -36,9 +37,12 @@ CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses") namespace kfr { +inline namespace CMT_ARCH_NAME +{ constexpr size_t inout_context_size = 16; @@ -73,20 +77,20 @@ struct complex; constexpr size_t infinite_size = static_cast<size_t>(-1); -CMT_INLINE constexpr size_t size_add(size_t x, size_t y) +CMT_INTRINSIC constexpr size_t size_add(size_t x, size_t y) { return (x == infinite_size || y == infinite_size) ? infinite_size : x + y; } -CMT_INLINE constexpr size_t size_sub(size_t x, size_t y) +CMT_INTRINSIC constexpr size_t size_sub(size_t x, size_t y) { return (x == infinite_size || y == infinite_size) ? infinite_size : (x > y ? x - y : 0); } -CMT_INLINE constexpr size_t size_min(size_t x) noexcept { return x; } +CMT_INTRINSIC constexpr size_t size_min(size_t x) CMT_NOEXCEPT { return x; } template <typename... Ts> -CMT_INLINE constexpr size_t size_min(size_t x, size_t y, Ts... rest) noexcept +CMT_INTRINSIC constexpr size_t size_min(size_t x, size_t y, Ts... rest) CMT_NOEXCEPT { return size_min(x < y ? x : y, rest...); } @@ -94,23 +98,23 @@ CMT_INLINE constexpr size_t size_min(size_t x, size_t y, Ts... 
rest) noexcept /// @brief Base class of all input expressoins struct input_expression { - CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; } + KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; } constexpr static bool is_incremental = false; - CMT_INLINE constexpr void begin_block(cinput_t, size_t) const {} - CMT_INLINE constexpr void end_block(cinput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void begin_block(cinput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void end_block(cinput_t, size_t) const {} }; /// @brief Base class of all output expressoins struct output_expression { - CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; } + KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; } constexpr static bool is_incremental = false; - CMT_INLINE constexpr void begin_block(coutput_t, size_t) const {} - CMT_INLINE constexpr void end_block(coutput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void begin_block(coutput_t, size_t) const {} + KFR_MEM_INTRINSIC constexpr void end_block(coutput_t, size_t) const {} }; /// @brief Check if the type argument is an input expression @@ -141,17 +145,14 @@ using is_numeric_args = and_t<is_numeric<Ts>...>; namespace internal { template <typename T, size_t N, typename Fn> -static vec<T, N> get_fn_value(size_t index, Fn&& fn) +inline vec<T, N> get_fn_value(size_t index, Fn&& fn) { - vec<T, N> x; - for (size_t i = 0; i < N; i++) - x[i] = fn(index + i); - return x; + return apply(fn, enumerate<size_t, N>() + index); } } // namespace internal template <typename E, typename Fn> -static void test_expression(const E& expr, size_t size, Fn&& fn, const char* expression = nullptr) +void test_expression(const E& expr, size_t size, Fn&& fn, const char* expression = nullptr) { using T = value_type_of<E>; ::testo::test_case* test = ::testo::active_test(); @@ -159,38 +160,20 @@ static void test_expression(const E& 
expr, size_t size, Fn&& fn, const char* exp test->check(c <= expr.size() == size, expression); if (expr.size() != size) return; - size = size_min(size, 100); + size = size_min(size, 200); + constexpr size_t maxsize = 2 + ilog2(vector_width<T> * 2); for (size_t i = 0; i < size;) { const size_t next_size = - std::min(prev_poweroftwo(size - i), static_cast<size_t>(1) << (std::rand() % 6)); - switch (next_size) - { - case 1: - test->check(c <= expr(cinput, i, vec_t<T, 1>()) == internal::get_fn_value<T, 1>(i, fn), - expression); - break; - case 2: - test->check(c <= expr(cinput, i, vec_t<T, 2>()) == internal::get_fn_value<T, 2>(i, fn), - expression); - break; - case 4: - test->check(c <= expr(cinput, i, vec_t<T, 4>()) == internal::get_fn_value<T, 4>(i, fn), - expression); - break; - case 8: - test->check(c <= expr(cinput, i, vec_t<T, 8>()) == internal::get_fn_value<T, 8>(i, fn), - expression); - break; - case 16: - test->check(c <= expr(cinput, i, vec_t<T, 16>()) == internal::get_fn_value<T, 16>(i, fn), - expression); - break; - case 32: - test->check(c <= expr(cinput, i, vec_t<T, 32>()) == internal::get_fn_value<T, 32>(i, fn), + std::min(prev_poweroftwo(size - i), static_cast<size_t>(1) << (std::rand() % maxsize)); + + cswitch(csize<1> << csizeseq<maxsize>, next_size, [&](auto x) { + constexpr size_t nsize = val_of(decltype(x)()); + ::testo::scope s(as_string("i = ", i)); + test->check(c <= get_elements(expr, cinput, i, vec_shape<T, nsize>()) == + internal::get_fn_value<T, nsize>(i, fn), expression); - break; - } + }); i += next_size; } } @@ -208,33 +191,26 @@ template <typename T, typename Fn> struct expression_lambda : input_expression { using value_type = T; - CMT_INLINE expression_lambda(Fn&& fn) : fn(std::move(fn)) {} + KFR_MEM_INTRINSIC expression_lambda(Fn&& fn) : fn(std::move(fn)) {} - template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_t<T, N>>::value)> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) 
const + template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_shape<T, N>>::value)> + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return fn(cinput, index, y); + return self.fn(cinput, index, y); } template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, size_t>::value)> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t, size_t index, + vec_shape<T, N>) { - vec<T, N> result; - for (size_t i = 0; i < N; i++) - { - result[i] = fn(index + i); - } - return result; + return apply(self.fn, enumerate<size_t, N>() + index); } template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn>::value)> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t, size_t, + vec_shape<T, N>) { - vec<T, N> result; - for (size_t i = 0; i < N; i++) - { - result[i] = fn(); - } - return result; + return apply<N>(self.fn); } Fn fn; @@ -269,19 +245,22 @@ namespace internal { template <typename... Args> -struct expression_base : input_expression +struct expression_with_arguments : input_expression { - CMT_INLINE constexpr size_t size() const noexcept { return size_impl(indicesfor_t<Args...>()); } + KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT + { + return size_impl(indicesfor_t<Args...>()); + } constexpr static size_t count = sizeof...(Args); - expression_base() = delete; - constexpr expression_base(Args&&... args) noexcept : args(std::forward<Args>(args)...) {} + expression_with_arguments() = delete; + constexpr expression_with_arguments(Args&&... args) CMT_NOEXCEPT : args(std::forward<Args>(args)...) 
{} - CMT_INLINE void begin_block(cinput_t cinput, size_t size) const + KFR_MEM_INTRINSIC void begin_block(cinput_t cinput, size_t size) const { begin_block_impl(cinput, size, indicesfor_t<Args...>()); } - CMT_INLINE void end_block(cinput_t cinput, size_t size) const + KFR_MEM_INTRINSIC void end_block(cinput_t cinput, size_t size) const { end_block_impl(cinput, size, indicesfor_t<Args...>()); } @@ -290,44 +269,48 @@ struct expression_base : input_expression protected: template <size_t... indices> - CMT_INLINE constexpr size_t size_impl(csizes_t<indices...>) const noexcept + KFR_MEM_INTRINSIC constexpr size_t size_impl(csizes_t<indices...>) const CMT_NOEXCEPT { return size_min(std::get<indices>(this->args).size()...); } template <typename Fn, typename T, size_t N> - CMT_INLINE vec<T, N> call(cinput_t cinput, Fn&& fn, size_t index, vec_t<T, N> x) const + KFR_MEM_INTRINSIC vec<T, N> call(cinput_t cinput, Fn&& fn, size_t index, vec_shape<T, N> x) const { return call_impl(cinput, std::forward<Fn>(fn), indicesfor_t<Args...>(), index, x); } template <size_t ArgIndex, typename U, size_t N, typename T = value_type_of<typename details::get_nth_type<ArgIndex, Args...>::type>> - CMT_INLINE vec<U, N> argument(cinput_t cinput, csize_t<ArgIndex>, size_t index, vec_t<U, N>) const + KFR_MEM_INTRINSIC vec<U, N> argument(cinput_t cinput, csize_t<ArgIndex>, size_t index, + vec_shape<U, N>) const { static_assert(ArgIndex < count, "Incorrect ArgIndex"); - return static_cast<vec<U, N>>(std::get<ArgIndex>(this->args)(cinput, index, vec_t<T, N>())); + return get_elements( + static_cast<vec<U, N>>(std::get<ArgIndex>(this->args), cinput, index, vec_shape<T, N>())); } template <typename U, size_t N, typename T = value_type_of<typename details::get_nth_type<0, Args...>::type>> - CMT_INLINE vec<U, N> argument_first(cinput_t cinput, size_t index, vec_t<U, N>) const + KFR_MEM_INTRINSIC vec<U, N> argument_first(cinput_t cinput, size_t index, vec_shape<U, N>) const { - return static_cast<vec<U, 
N>>(std::get<0>(this->args)(cinput, index, vec_t<T, N>())); + return static_cast<vec<U, N>>( + get_elements(std::get<0>(this->args), cinput, index, vec_shape<T, N>())); } private: template <typename Fn, typename T, size_t N, size_t... indices> - CMT_INLINE vec<T, N> call_impl(cinput_t cinput, Fn&& fn, csizes_t<indices...>, size_t index, - vec_t<T, N>) const + KFR_MEM_INTRINSIC vec<T, N> call_impl(cinput_t cinput, Fn&& fn, csizes_t<indices...>, size_t index, + vec_shape<T, N>) const { - return fn(std::get<indices>(this->args)(cinput, index, vec_t<value_type_of<Args>, N>())...); + return fn(get_elements(std::get<indices>(this->args), cinput, index, + vec_shape<value_type_of<Args>, N>())...); } template <size_t... indices> - CMT_INLINE void begin_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const + KFR_MEM_INTRINSIC void begin_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const { swallow{ (std::get<indices>(args).begin_block(cinput, size), 0)... }; } template <size_t... indices> - CMT_INLINE void end_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const + KFR_MEM_INTRINSIC void end_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const { swallow{ (std::get<indices>(args).end_block(cinput, size), 0)... 
}; } @@ -338,14 +321,15 @@ struct expression_scalar : input_expression { using value_type = T; expression_scalar() = delete; - constexpr expression_scalar(const T& val) noexcept : val(val) {} - constexpr expression_scalar(const vec<T, width>& val) noexcept : val(val) {} + constexpr expression_scalar(const T& val) CMT_NOEXCEPT : val(val) {} + constexpr expression_scalar(const vec<T, width>& val) CMT_NOEXCEPT : val(val) {} vec<T, width> val; template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_scalar& self, cinput_t, size_t, + vec_shape<T, N>) { - return resize<N>(val); + return resize<N>(self.val); } }; @@ -377,27 +361,30 @@ template <typename T> using arg = typename internal::arg_impl<decay<T>, T>::type; template <typename Fn, typename... Args> -struct expression_function : expression_base<arg<Args>...> +struct expression_function : expression_with_arguments<arg<Args>...> { using value_type = subtype<decltype(std::declval<Fn>()(std::declval<vec<value_type_of<arg<Args>>, 1>>()...))>; using T = value_type; - expression_function(Fn&& fn, arg<Args>&&... args) noexcept - : expression_base<arg<Args>...>(std::forward<arg<Args>>(args)...), fn(std::forward<Fn>(fn)) + expression_function(Fn&& fn, arg<Args>&&... args) CMT_NOEXCEPT + : expression_with_arguments<arg<Args>...>(std::forward<arg<Args>>(args)...), + fn(std::forward<Fn>(fn)) { } - expression_function(const Fn& fn, arg<Args>&&... args) noexcept - : expression_base<arg<Args>...>(std::forward<arg<Args>>(args)...), fn(fn) + expression_function(const Fn& fn, arg<Args>&&... 
args) CMT_NOEXCEPT + : expression_with_arguments<arg<Args>...>(std::forward<arg<Args>>(args)...), + fn(fn) { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> x) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_function& self, cinput_t cinput, + size_t index, vec_shape<T, N> x) { - return this->call(cinput, fn, index, x); + return self.call(cinput, self.fn, index, x); } - const Fn& get_fn() const noexcept { return fn; } + const Fn& get_fn() const CMT_NOEXCEPT { return fn; } protected: Fn fn; @@ -405,25 +392,25 @@ protected: } // namespace internal template <typename A> -CMT_INLINE internal::arg<A> e(A&& a) +CMT_INTRINSIC internal::arg<A> e(A&& a) { return internal::arg<A>(std::forward<A>(a)); } template <typename T> -CMT_INLINE internal::expression_scalar<T> scalar(const T& val) +CMT_INTRINSIC internal::expression_scalar<T> scalar(const T& val) { return internal::expression_scalar<T>(val); } template <typename T, size_t N> -CMT_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val) +CMT_INTRINSIC internal::expression_scalar<T, N> scalar(const vec<T, N>& val) { return internal::expression_scalar<T, N>(val); } template <typename Fn, typename... Args> -CMT_INLINE internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&& fn, Args&&... args) +CMT_INTRINSIC internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&& fn, Args&&... args) { return internal::expression_function<decay<Fn>, Args...>(std::forward<Fn>(fn), std::forward<Args>(args)...); @@ -434,17 +421,16 @@ CMT_INLINE internal::expression_function<decay<Fn>, Args...> bind_expression(Fn& * @param args new arguments for the function */ template <typename Fn, typename... OldArgs, typename... NewArgs> -CMT_INLINE internal::expression_function<Fn, NewArgs...> rebind( +CMT_INTRINSIC internal::expression_function<Fn, NewArgs...> rebind( const internal::expression_function<Fn, OldArgs...>& e, NewArgs&&... 
args) { return internal::expression_function<Fn, NewArgs...>(e.get_fn(), std::forward<NewArgs>(args)...); } -template <cpu_t c = cpu_t::native, size_t width = 0, typename OutputExpr, typename InputExpr, - size_t groupsize = 1> -CMT_INLINE static size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 0, - size_t size = infinite_size, coutput_t coutput = nullptr, - cinput_t cinput = nullptr, csize_t<groupsize> = csize_t<groupsize>()) +template <size_t width = 0, typename OutputExpr, typename InputExpr, size_t groupsize = 1> +CMT_INTRINSIC static size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 0, + size_t size = infinite_size, coutput_t coutput = nullptr, + cinput_t cinput = nullptr, csize_t<groupsize> = csize_t<groupsize>()) { using Tin = value_type_of<InputExpr>; static_assert(is_output_expression<OutputExpr>::value, "OutFn must be an expression"); @@ -453,24 +439,25 @@ CMT_INLINE static size_t process(OutputExpr&& out, const InputExpr& in, size_t s size = size_sub(size_min(out.size(), in.size(), size_add(size, start)), start); if (size == 0 || size == infinite_size) return size; - const size_t end = start + size; out.begin_block(coutput, size); in.begin_block(cinput, size); #ifdef NDEBUG - constexpr size_t w = width == 0 ? platform<Tin, c>::vector_capacity / 4 : width; + constexpr size_t w = width == 0 ? maximum_vector_size<Tin> : width; #else - constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width : width; + constexpr size_t w = width == 0 ? 
vector_width<Tin> : width; #endif + static_assert(w > 0 && is_poweroftwo(w), ""); + size_t i = start; CMT_LOOP_NOUNROLL for (; i < start + size / w * w; i += w) - out(coutput, i, in(cinput, i, vec_t<Tin, w>())); + out(coutput, i, get_elements(in, cinput, i, vec_shape<Tin, w>())); CMT_LOOP_NOUNROLL for (; i < start + size / groupsize * groupsize; i += groupsize) - out(coutput, i, in(cinput, i, vec_t<Tin, groupsize>())); + out(coutput, i, get_elements(in, cinput, i, vec_shape<Tin, groupsize>())); in.end_block(cinput, size); out.end_block(coutput, size); @@ -483,11 +470,12 @@ struct input_expression_base : input_expression virtual ~input_expression_base() {} virtual T input(size_t index) const = 0; template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + friend KFR_INTRINSIC vec<U, N> get_elements(const input_expression_base& self, cinput_t, size_t index, + vec_shape<U, N>) { vec<U, N> out; for (size_t i = 0; i < N; i++) - out[i] = static_cast<U>(input(index + i)); + out[i] = static_cast<U>(self.input(index + i)); return out; } }; @@ -499,12 +487,19 @@ struct output_expression_base : output_expression virtual void output(size_t index, const T& value) = 0; template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t index, const vec<U, N>& value) { for (size_t i = 0; i < N; i++) output(index + i, static_cast<T>(value[i])); } }; + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +CMT_INTRINSIC internal::expression_function<fn::interleave, E1, E2> interleave(E1&& x, E2&& y) +{ + return { fn::interleave(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/filter.hpp b/include/kfr/base/filter.hpp @@ -1,4 +1,4 @@ -/** @addtogroup math +/** @addtogroup filter * @{ */ /* @@ 
-32,6 +32,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ /// @brief Abstract base class for filters with one argument. Mainly for DSP template <typename T> @@ -131,16 +133,17 @@ protected: /// @brief Converts expression with placeholder to filter. Placeholder and filter must have the same type template <typename E, typename T = value_type_of<E>> -KFR_SINTRIN expression_filter<T> to_filter(E&& e) +KFR_INTRINSIC expression_filter<T> to_filter(E&& e) { return expression_filter<T>(to_pointer(std::move(e))); } /// @brief Converts expression with placeholder to filter. Placeholder and filter must have the same type template <typename T, typename E> -KFR_SINTRIN expression_filter<T> to_filter(expression_pointer<T>&& e) +KFR_INTRINSIC expression_filter<T> to_filter(expression_pointer<T>&& e) { return expression_filter<T>(std::move(e)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/fraction.hpp b/include/kfr/base/fraction.hpp @@ -25,8 +25,7 @@ */ #pragma once -#include "operators.hpp" -#include "vec.hpp" +#include "../simd/types.hpp" namespace kfr { diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp @@ -1,268 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "expression.hpp" -#include "shuffle.hpp" -#include "types.hpp" -#include "vec.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") - -namespace kfr -{ - -#define KFR_I_CONVERTER(fn) \ - template <typename T1, typename... Args, typename Tout = ::cometa::common_type<T1, Args...>> \ - KFR_SINTRIN Tout fn(const T1& a, const Args&... b) \ - { \ - using vecout = vec1<Tout>; \ - return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \ - } - -#define KFR_I_FLT_CONVERTER(fn) \ - template <typename T1, typename... Args, \ - typename Tout = ::kfr::flt_type<::cometa::common_type<T1, Args...>>> \ - KFR_SINTRIN Tout fn(const T1& a, const Args&... b) \ - { \ - using vecout = vec1<Tout>; \ - return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \ - } - -namespace intrinsics -{ -#ifdef CMT_ARCH_X86 -using f32sse = vec<f32, 4>; -using f64sse = vec<f64, 2>; -using i8sse = vec<i8, 16>; -using i16sse = vec<i16, 8>; -using i32sse = vec<i32, 4>; -using i64sse = vec<i64, 2>; -using u8sse = vec<u8, 16>; -using u16sse = vec<u16, 8>; -using u32sse = vec<u32, 4>; -using u64sse = vec<u64, 2>; - -using f32avx = vec<f32, 8>; -using f64avx = vec<f64, 4>; -using i8avx = vec<i8, 32>; -using i16avx = vec<i16, 16>; -using i32avx = vec<i32, 8>; -using i64avx = vec<i64, 4>; -using u8avx = vec<u8, 32>; -using u16avx = vec<u16, 16>; -using u32avx = vec<u32, 8>; -using u64avx = vec<u64, 4>; - -using f32avx512 = vec<f32, 16>; -using f64avx512 = vec<f64, 8>; -using i8avx512 = vec<i8, 64>; -using i16avx512 = vec<i16, 32>; -using i32avx512 = vec<i32, 16>; -using i64avx512 = vec<i64, 8>; -using u8avx512 = vec<u8, 64>; -using u16avx512 = vec<u16, 32>; -using u32avx512 = vec<u32, 16>; -using u64avx512 = vec<u64, 8>; - -#else 
-using f32neon = vec<f32, 4>; -using f64neon = vec<f64, 2>; -using i8neon = vec<i8, 16>; -using i16neon = vec<i16, 8>; -using i32neon = vec<i32, 4>; -using i64neon = vec<i64, 2>; -using u8neon = vec<u8, 16>; -using u16neon = vec<u16, 8>; -using u32neon = vec<u32, 4>; -using u64neon = vec<u64, 2>; -#endif - -template <cpu_t c, typename T> -constexpr inline size_t next_simd_width(size_t n) -{ -#ifdef CMT_ARCH_X86 - return n > platform<T, cpu_t::sse2>::vector_width ? platform<T, c>::vector_width - : platform<T, cpu_t::sse2>::vector_width; -#endif -#ifdef CMT_ARCH_ARM - return platform<T, cpu_t::neon>::vector_width; -#endif -} - -template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> -KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x) -{ - return extend<Nout>(x); -} - -template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> -KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) -{ - return widen<Nout>(x, value); -} - -#define KFR_HANDLE_ALL_SIZES_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_FLT_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a)))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a)))); \ - } - -#define KFR_HANDLE_ALL_SIZES_F_1(fn) \ - 
template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_f_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_f_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_I_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_i_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_i_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_U_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_u_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_u_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && !is_f_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return slice<0, N>(fn(expand_simd(a))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && !is_f_class<T>::value), \ - typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ - { \ - return concat(fn(low(a)), fn(high(a))); \ - } - -#define KFR_HANDLE_ALL_SIZES_2(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < 
platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ - { \ - return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ - { \ - return concat(fn(low(a), low(b)), fn(high(a), high(b))); \ - } - -#define KFR_HANDLE_ALL_SIZES_2_INT(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, int b) \ - { \ - return slice<0, N>(fn(expand_simd(a), b)); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, int b) \ - { \ - return concat(fn(low(a), b), fn(high(a), b)); \ - } - -#define KFR_HANDLE_ALL_SIZES_3(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ - { \ - return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ - { \ - return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \ - } - -#define KFR_HANDLE_ALL_SIZES_4(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ - { \ - return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \ - } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ - KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ - { \ - 
return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \ - } - -template <typename T> -using vec1 = conditional<is_vec<T>::value, T, vec<T, 1>>; - -template <typename T> -inline T to_scalar(const T& value) -{ - return value; -} -template <typename T> -inline T to_scalar(const vec<T, 1>& value) -{ - return value[0]; -} -} // namespace intrinsics -} // namespace kfr -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/function_expressions.hpp b/include/kfr/base/function_expressions.hpp @@ -0,0 +1,30 @@ +/** @addtogroup expressions + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +namespace kfr +{ +} // namespace kfr diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp @@ -1,60 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/gamma.hpp" - -namespace kfr -{ - -/// @brief Returns the approximate gamma function of an argument -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> gamma(const T1& x) -{ - return intrinsics::gamma(x); -} - -/// @brief Creates expression that returns the approximate gamma function of an argument -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::gamma, E1> gamma(E1&& x) -{ - return { fn::gamma(), std::forward<E1>(x) }; -} - -/// @brief Returns the approximate factorial of an argument -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> factorial_approx(const T1& x) -{ - return intrinsics::factorial_approx(x); -} - -/// @brief Creates expression that returns the approximate factorial of an argument -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::factorial_approx, E1> factorial_approx(E1&& x) -{ - return { fn::factorial_approx(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp @@ -1,4 +1,4 @@ -/** @addtogroup expressions +/** @addtogroup generators * @{ */ /* @@ -25,14 +25,16 @@ */ #pragma once -#include "function.hpp" -#include "log_exp.hpp" -#include "select.hpp" -#include "sin_cos.hpp" -#include "vec.hpp" +#include "../math/log_exp.hpp" +#include "../math/select.hpp" +#include "../math/sin_cos.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ namespace internal { @@ -41,14 +43,15 @@ template <typename T, size_t width_, typename Class> struct generator : input_expression { constexpr static size_t width = width_; - using value_type = T; + using value_type = T; constexpr static bool is_incremental = true; template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t, 
vec_t<U, N> t) const + friend KFR_INTRINSIC vec<U, N> get_elements(const generator& self, cinput_t, size_t, + vec_shape<U, N> t) { - return generate(t); + return self.generate(t); } void resync(T start) const { ptr_cast<Class>(this)->sync(start); } @@ -70,7 +73,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N == width)> - CMT_INLINE vec<T, N> generate(vec_t<T, N>) const + KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const { const vec<T, N> result = value; call_next(); @@ -78,7 +81,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N < width)> - CMT_INLINE vec<T, N> generate(vec_t<T, N>) const + KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const { const vec<T, N> result = narrow<N>(value); shift(csize_t<N>()); @@ -86,7 +89,7 @@ protected: } template <size_t N, KFR_ENABLE_IF(N > width)> - CMT_INLINE vec<T, N> generate(vec_t<T, N> x) const + KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N> x) const { const auto lo = generate(low(x)); const auto hi = generate(high(x)); @@ -96,58 +99,64 @@ protected: mutable vec<T, width> value; }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2)> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)> struct generator_linear : generator<T, width, generator_linear<T, width>> { - constexpr generator_linear(T start, T step) noexcept : step(step), vstep(step * width) + generator_linear(T start, T step) CMT_NOEXCEPT : step(step), vstep(step* width) { this->resync(start); } + + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { - this->resync(start); + this->value = start + enumerate<T, width>() * step; } - CMT_INLINE void sync(T start) const noexcept { this->value = start + enumerate<T, width>() * step; } - - CMT_INLINE void next() const noexcept { this->value += vstep; } + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = platform<T>::vector_width* 
bitness_const(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_exp : generator<T, width, generator_exp<T, width>> { - generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step * width))[0] - 1) + generator_exp(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp(make_vector(step* width))[0] - 1) { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept { this->value = exp(start + enumerate<T, width>() * step); } + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT + { + this->value = exp(start + enumerate<T, width>() * step); + } - CMT_INLINE void next() const noexcept { this->value += this->value * vstep; } + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_exp2 : generator<T, width, generator_exp2<T, width>> { - generator_exp2(T start, T step) noexcept : step(step), vstep(exp2(make_vector(step * width))[0] - 1) + generator_exp2(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp2(make_vector(step* width))[0] - 1) { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept { this->value = exp2(start + enumerate<T, width>() * step); } + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT + { + this->value = exp2(start + enumerate<T, width>() * step); + } - CMT_INLINE void next() const noexcept { this->value += this->value * vstep; } + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; } protected: T step; T vstep; }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP> struct 
generator_cossin : generator<T, width, generator_cossin<T, width>> { generator_cossin(T start, T step) @@ -155,9 +164,9 @@ struct generator_cossin : generator<T, width, generator_cossin<T, width>> { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept { this->value = init_cossin(step, start); } + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { this->value = init_cossin(step, start); } - CMT_INLINE void next() const noexcept + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value = this->value - subadd(alpha * this->value, beta * swap<2>(this->value)); } @@ -172,7 +181,7 @@ protected: } }; -template <typename T, size_t width = platform<T>::vector_width* bitness_const(2, 4), KFR_ARCH_DEP> +template <typename T, size_t width = vector_width<T>* bitness_const(2, 4), KFR_ARCH_DEP> struct generator_sin : generator<T, width, generator_sin<T, width>> { generator_sin(T start, T step) @@ -180,14 +189,14 @@ struct generator_sin : generator<T, width, generator_sin<T, width>> { this->resync(start); } - CMT_INLINE void sync(T start) const noexcept + KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step))); this->cos_value = low(cs); this->value = high(cs); } - CMT_INLINE void next() const noexcept + KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { const vec<T, width> c = this->cos_value; const vec<T, width> s = this->value; @@ -200,7 +209,7 @@ struct generator_sin : generator<T, width, generator_sin<T, width>> } template <size_t N> - void shift(csize_t<N>) const noexcept + void shift(csize_t<N>) const CMT_NOEXCEPT { const vec<T, width> oldvalue = this->value; const vec<T, width> oldcosvalue = this->cos_value; @@ -226,7 +235,7 @@ protected: \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_linear<TF> gen_linear(T1 start, T2 step) +KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 
start, T2 step) { return internal::generator_linear<TF>(start, step); } @@ -238,7 +247,7 @@ KFR_SINTRIN internal::generator_linear<TF> gen_linear(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_exp<TF> gen_exp(T1 start, T2 step) +KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step) { return internal::generator_exp<TF>(start, step); } @@ -250,7 +259,7 @@ KFR_SINTRIN internal::generator_exp<TF> gen_exp(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_exp2<TF> gen_exp2(T1 start, T2 step) +KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step) { return internal::generator_exp2<TF>(start, step); } @@ -266,7 +275,7 @@ KFR_SINTRIN internal::generator_exp2<TF> gen_exp2(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_cossin<TF> gen_cossin(T1 start, T2 step) +KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step) { return internal::generator_cossin<TF>(start, step); } @@ -278,8 +287,9 @@ KFR_SINTRIN internal::generator_cossin<TF> gen_cossin(T1 start, T2 step) \f] */ template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>> -KFR_SINTRIN internal::generator_sin<TF> gen_sin(T1 start, T2 step) +KFR_FUNCTION internal::generator_sin<TF> gen_sin(T1 start, T2 step) { return internal::generator_sin<TF>(start, step); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/horizontal.hpp b/include/kfr/base/horizontal.hpp @@ -1,119 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either 
version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "operators.hpp" - -namespace kfr -{ - -namespace internal -{ - -template <typename T, typename ReduceFn> -CMT_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) -{ - return T(value[0]); -} - -template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))> -CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) -{ - return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce)); -} -template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))> -CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) -{ - const T initial = reduce(initialvalue<T>()); - return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce)); -} -} // namespace internal - -template <typename T, size_t N, typename ReduceFn> -CMT_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce) -{ - return internal::horizontal_impl(value, std::forward<ReduceFn>(reduce)); -} - -/// @brief Sum all elements of the vector -template <typename T, size_t N> -CMT_INLINE T hadd(const vec<T, N>& value) -{ - return horizontal(value, fn::add()); -} -KFR_FN(hadd) - -/// @brief Multiply all elements of the vector -template <typename T, size_t N> -CMT_INLINE 
T hmul(const vec<T, N>& value) -{ - return horizontal(value, fn::mul()); -} -KFR_FN(hmul) - -template <typename T, size_t N> -CMT_INLINE T hbitwiseand(const vec<T, N>& value) -{ - return horizontal(value, fn::bitwiseand()); -} -KFR_FN(hbitwiseand) -template <typename T, size_t N> -CMT_INLINE T hbitwiseor(const vec<T, N>& value) -{ - return horizontal(value, fn::bitwiseor()); -} -KFR_FN(hbitwiseor) -template <typename T, size_t N> -CMT_INLINE T hbitwisexor(const vec<T, N>& value) -{ - return horizontal(value, fn::bitwisexor()); -} -KFR_FN(hbitwisexor) - -/// @brief Calculate the Dot-Product of two vectors -template <typename T, size_t N> -CMT_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y) -{ - return hadd(x * y); -} -KFR_FN(dot) - -/// @brief Calculate the Arithmetic mean of all elements in the vector -template <typename T, size_t N> -CMT_INLINE T avg(const vec<T, N>& value) -{ - return hadd(value) / N; -} -KFR_FN(avg) - -/// @brief Calculate the RMS of all elements in the vector -template <typename T, size_t N> -CMT_INLINE T rms(const vec<T, N>& value) -{ - return internal::builtin_sqrt(hadd(value * value) / N); -} -KFR_FN(rms) -} // namespace kfr diff --git a/include/kfr/base/hyperbolic.hpp b/include/kfr/base/hyperbolic.hpp @@ -1,120 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/hyperbolic.hpp" - -namespace kfr -{ - -/// @brief Returns the hyperbolic sine of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sinh(const T1& x) -{ - return intrinsics::sinh(x); -} - -/// @brief Returns template expression that returns the hyperbolic sine of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinh, E1> sinh(E1&& x) -{ - return { fn::sinh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cosine of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cosh(const T1& x) -{ - return intrinsics::cosh(x); -} - -/// @brief Returns template expression that returns the hyperbolic cosine of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cosh, E1> cosh(E1&& x) -{ - return { fn::cosh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic tangent of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> tanh(const T1& x) -{ - return intrinsics::tanh(x); -} - -/// @brief Returns template expression that returns the hyperbolic tangent of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::tanh, E1> tanh(E1&& x) -{ - return { fn::tanh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cotangent of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> coth(const T1& x) -{ - return intrinsics::coth(x); -} - -/// @brief Returns template 
expression that returns the hyperbolic cotangent of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::coth, E1> coth(E1&& x) -{ - return { fn::coth(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic sine of the even elements of the x and the hyperbolic cosine of the odd -/// elements of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sinhcosh(const T1& x) -{ - return intrinsics::sinhcosh(x); -} - -/// @brief Returns template expression that returns the hyperbolic sine of the even elements of the x and the -/// hyperbolic cosine of the odd elements of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinhcosh, E1> sinhcosh(E1&& x) -{ - return { fn::sinhcosh(), std::forward<E1>(x) }; -} - -/// @brief Returns the hyperbolic cosine of the even elements of the x and the hyperbolic sine of the odd -/// elements of the x -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> coshsinh(const T1& x) -{ - return intrinsics::coshsinh(x); -} - -/// @brief Returns template expression that returns the hyperbolic cosine of the even elements of the x and -/// the hyperbolic sine of the odd elements of the x -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::coshsinh, E1> coshsinh(E1&& x) -{ - return { fn::coshsinh(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/impl/abs.hpp b/include/kfr/base/impl/abs.hpp @@ -1,126 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any 
later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS - -// floating point -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) -{ - return x & constants<T>::invhighbitmask(); -} - -KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); } -KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); } -KFR_SINTRIN i8sse abs(const i8sse& x) { return _mm_abs_epi8(*x); } -KFR_SINTRIN u64sse abs(const u64sse& x) { return x; } -KFR_SINTRIN u32sse abs(const u32sse& x) { return x; } -KFR_SINTRIN u16sse abs(const u16sse& x) { return x; } -KFR_SINTRIN u8sse abs(const u8sse& x) { return x; } - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN i64avx abs(const i64avx& x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32avx abs(const i32avx& x) { return _mm256_abs_epi32(*x); } -KFR_SINTRIN i16avx abs(const i16avx& x) { return _mm256_abs_epi16(*x); } -KFR_SINTRIN i8avx abs(const i8avx& x) { return _mm256_abs_epi8(*x); } -KFR_SINTRIN u64avx abs(const u64avx& x) { return x; } -KFR_SINTRIN u32avx abs(const u32avx& x) { return x; } -KFR_SINTRIN u16avx abs(const 
u16avx& x) { return x; } -KFR_SINTRIN u8avx abs(const u8avx& x) { return x; } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN i64avx512 abs(const i64avx512& x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32avx512 abs(const i32avx512& x) { return _mm512_abs_epi32(*x); } -KFR_SINTRIN i16avx512 abs(const i16avx512& x) { return _mm512_abs_epi16(*x); } -KFR_SINTRIN i8avx512 abs(const i8avx512& x) { return _mm512_abs_epi8(*x); } -KFR_SINTRIN u64avx512 abs(const u64avx512& x) { return x; } -KFR_SINTRIN u32avx512 abs(const u32avx512& x) { return x; } -KFR_SINTRIN u16avx512 abs(const u16avx512& x) { return x; } -KFR_SINTRIN u8avx512 abs(const u8avx512& x) { return x; } -#endif - -KFR_HANDLE_ALL_SIZES_NOT_F_1(abs) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); } -KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); } -KFR_SINTRIN i32neon abs(const i32neon& x) { return vabsq_s32(*x); } -#if defined CMT_ARCH_NEON64 -KFR_SINTRIN i64neon abs(const i64neon& x) { return vabsq_s64(*x); } -#else -KFR_SINTRIN i64neon abs(const i64neon& x) { return select(x >= 0, x, -x); } -#endif - -KFR_SINTRIN u8neon abs(const u8neon& x) { return x; } -KFR_SINTRIN u16neon abs(const u16neon& x) { return x; } -KFR_SINTRIN u32neon abs(const u32neon& x) { return x; } -KFR_SINTRIN u64neon abs(const u64neon& x) { return x; } - -KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); } -#if defined CMT_ARCH_NEON64 -KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); } -#else -KFR_SINTRIN f64neon abs(const f64neon& x) { return x & constants<f64>::invhighbitmask(); } -#endif - -KFR_HANDLE_ALL_SIZES_1(abs) - -#else - -// floating point -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) -{ - return x & constants<T>::invhighbitmask(); -} - -// fallback -template <typename T, size_t N, 
KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) -{ - return select(x >= T(0), x, -x); -} -#endif -KFR_I_CONVERTER(abs) -} // namespace intrinsics - -KFR_I_FN(abs) - -} // namespace kfr diff --git a/include/kfr/base/impl/asin_acos.hpp b/include/kfr/base/impl/asin_acos.hpp @@ -1,58 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../atan.hpp" -#include "../function.hpp" -#include "../select.hpp" -#include "../sqrt.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x) -{ - const vec<Tout, N> xx = x; - return atan2(xx, sqrt(Tout(1) - xx * xx)); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x) -{ - const vec<Tout, N> xx = x; - return -atan2(xx, sqrt(Tout(1) - xx * xx)) + constants<Tout>::pi * 0.5; -} -KFR_I_FLT_CONVERTER(asin) -KFR_I_FLT_CONVERTER(acos) -} // namespace intrinsics -KFR_I_FN(asin) -KFR_I_FN(acos) - -} // namespace kfr diff --git a/include/kfr/base/impl/atan.hpp b/include/kfr/base/impl/atan.hpp @@ -1,229 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" -#include "../sin_cos.hpp" - -namespace kfr -{ -namespace intrinsics -{ -template <size_t N> -KFR_SINTRIN vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx) -{ - vec<f32, N> x = xx, y = yy; - vec<f32, N> s, t, u; - vec<i32, N> q; - q = select(x < 0, -2, 0); - x = select(x < 0, -x, x); - mask<i32, N> m; - m = y > x; - t = x; - x = select(m, y, x); - y = select(m, -t, y); - q = select(m, q + 1, q); - s = y / x; - t = s * s; - u = 0.00282363896258175373077393f; - u = fmadd(u, t, -0.0159569028764963150024414f); - u = fmadd(u, t, 0.0425049886107444763183594f); - u = fmadd(u, t, -0.0748900920152664184570312f); - u = fmadd(u, t, 0.106347933411598205566406f); - u = fmadd(u, t, -0.142027363181114196777344f); - u = fmadd(u, t, 0.199926957488059997558594f); - u = fmadd(u, t, -0.333331018686294555664062f); - t = u * t * s + s; - t = cast<f32>(q) * 1.5707963267948966192313216916398f + t; - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx) -{ - vec<f64, N> x = xx, y = yy; - vec<f64, N> s, t, u; - vec<i64, N> q; - q = select(x < 0, i64(-2), i64(0)); - x = select(x < 0, -x, x); - mask<i64, N> m; - m = y > x; - t = x; - x = select(m, y, x); - y = select(m, -t, y); - q = select(m, q + i64(1), q); - s = y / x; - t = s * s; - u = -1.88796008463073496563746e-05; - u = fmadd(u, t, 0.000209850076645816976906797); - u = fmadd(u, t, -0.00110611831486672482563471); - u = fmadd(u, t, 0.00370026744188713119232403); - u = fmadd(u, t, -0.00889896195887655491740809); - u = fmadd(u, t, 0.016599329773529201970117); - u = fmadd(u, t, -0.0254517624932312641616861); - u = fmadd(u, t, 0.0337852580001353069993897); - u = fmadd(u, t, -0.0407629191276836500001934); - u = fmadd(u, t, 0.0466667150077840625632675); - u = fmadd(u, t, -0.0523674852303482457616113); - u = fmadd(u, 
t, 0.0587666392926673580854313); - u = fmadd(u, t, -0.0666573579361080525984562); - u = fmadd(u, t, 0.0769219538311769618355029); - u = fmadd(u, t, -0.090908995008245008229153); - u = fmadd(u, t, 0.111111105648261418443745); - u = fmadd(u, t, -0.14285714266771329383765); - u = fmadd(u, t, 0.199999999996591265594148); - u = fmadd(u, t, -0.333333333333311110369124); - t = u * t * s + s; - t = cast<f64>(q) * 1.5707963267948966192313216916398 + t; - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x) -{ - vec<f32, N> r = atan2k(abs(y), x); - constexpr f32 pi = 3.1415926535897932384626433832795f; - constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f; - constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f; - r = mulsign(r, x); - r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r); - r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r); - r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r); - r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); - return r; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) -{ - vec<f64, N> r = atan2k(abs(y), x); - constexpr f64 pi = 3.1415926535897932384626433832795; - constexpr f64 pi_over_2 = 1.5707963267948966192313216916398; - constexpr f64 pi_over_4 = 0.78539816339744830961566084581988; - r = mulsign(r, x); - r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r); - r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r); - r = select(y == 0.0, select(x < 0., pi, 0.), r); - r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); - return r; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atan(const vec<f32, N>& x) -{ - vec<f32, N> t, u; - vec<i32, N> q; - q = select(x < 0.f, 2, 0); - vec<f32, N> s = select(x < 0.f, -x, x); - q = select(s > 1.f, q | 1, q); - s = 
select(s > 1.f, 1.0f / s, s); - t = s * s; - u = 0.00282363896258175373077393f; - u = fmadd(u, t, -0.0159569028764963150024414f); - u = fmadd(u, t, 0.0425049886107444763183594f); - u = fmadd(u, t, -0.0748900920152664184570312f); - u = fmadd(u, t, 0.106347933411598205566406f); - u = fmadd(u, t, -0.142027363181114196777344f); - u = fmadd(u, t, 0.199926957488059997558594f); - u = fmadd(u, t, -0.333331018686294555664062f); - t = s + s * (t * u); - t = select((q & 1) != 0, 1.570796326794896557998982f - t, t); - t = select((q & 2) != 0, -t, t); - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan(const vec<f64, N>& x) -{ - vec<f64, N> t, u; - vec<i64, N> q; - q = select(x < 0.0, i64(2), i64(0)); - vec<f64, N> s = select(x < 0.0, -x, x); - q = select(s > 1.0, q | 1, q); - s = select(s > 1.0, 1.0 / s, s); - t = s * s; - u = -1.88796008463073496563746e-05; - u = fmadd(u, t, 0.000209850076645816976906797); - u = fmadd(u, t, -0.00110611831486672482563471); - u = fmadd(u, t, 0.00370026744188713119232403); - u = fmadd(u, t, -0.00889896195887655491740809); - u = fmadd(u, t, 0.016599329773529201970117); - u = fmadd(u, t, -0.0254517624932312641616861); - u = fmadd(u, t, 0.0337852580001353069993897); - u = fmadd(u, t, -0.0407629191276836500001934); - u = fmadd(u, t, 0.0466667150077840625632675); - u = fmadd(u, t, -0.0523674852303482457616113); - u = fmadd(u, t, 0.0587666392926673580854313); - u = fmadd(u, t, -0.0666573579361080525984562); - u = fmadd(u, t, 0.0769219538311769618355029); - u = fmadd(u, t, -0.090908995008245008229153); - u = fmadd(u, t, 0.111111105648261418443745); - u = fmadd(u, t, -0.14285714266771329383765); - u = fmadd(u, t, 0.199999999996591265594148); - u = fmadd(u, t, -0.333333333333311110369124); - t = s + s * (t * u); - t = select((q & 1) != 0, 1.570796326794896557998982 - t, t); - t = select((q & 2) != 0, -t, t); - return t; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atandeg(const vec<f32, N>& x) -{ - return atan(x) * c_radtodeg<f32>; 
-} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atandeg(const vec<f64, N>& x) -{ - return atan(x) * c_radtodeg<f64>; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> atan2deg(const vec<f32, N>& y, const vec<f32, N>& x) -{ - return atan2(y, x) * c_radtodeg<f32>; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> atan2deg(const vec<f64, N>& y, const vec<f64, N>& x) -{ - return atan2(y, x) * c_radtodeg<f64>; -} - -KFR_I_FLT_CONVERTER(atan) -KFR_I_FLT_CONVERTER(atan2) -KFR_I_FLT_CONVERTER(atandeg) -KFR_I_FLT_CONVERTER(atan2deg) -} // namespace intrinsics -KFR_I_FN(atan) -KFR_I_FN(atandeg) -KFR_I_FN(atan2) -KFR_I_FN(atan2deg) -} // namespace kfr diff --git a/include/kfr/base/impl/clamp.hpp b/include/kfr/base/impl/clamp.hpp @@ -1,56 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../min_max.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T> -KFR_SINTRIN T clamp(const T& x, const T& lo, const T& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) -{ - return max(min(x, hi), zerovector<T, N>()); -} -} // namespace intrinsics -KFR_I_FN(clamp) - -} // namespace kfr diff --git a/include/kfr/base/impl/gamma.hpp b/include/kfr/base/impl/gamma.hpp @@ -1,72 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "../function.hpp" -#include "../log_exp.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wc99-extensions") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") -#endif - -namespace kfr -{ - -namespace intrinsics -{ -template <typename T> -constexpr T gamma_precalc[] = { - 0x2.81b263fec4e08p+0, 0x3.07b4100e04448p+16, -0xa.a0da01d4d4e2p+16, 0xf.05ccb27bb9dbp+16, - -0xa.fa79616b7c6ep+16, 0x4.6dd6c10d4df5p+16, -0xf.a2304199eb4ap+12, 0x1.c21dd4aade3dp+12, - -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0, -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24, -}; - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> gamma(const vec<T, N>& z) -{ - constexpr size_t Count = arraysize(gamma_precalc<T>); - vec<T, N> accm = gamma_precalc<T>[0]; - CMT_LOOP_UNROLL - for (size_t k = 1; k < Count; k++) - accm += gamma_precalc<T>[k] / (z + cast<utype<T>>(k)); - accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); - return accm / z; -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> factorial_approx(const vec<T, N>& x) -{ - return gamma(x + T(1)); -} -KFR_I_FLT_CONVERTER(gamma) -KFR_I_FLT_CONVERTER(factorial_approx) -} // namespace intrinsics -KFR_I_FN(gamma) -KFR_I_FN(factorial_approx) - -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/impl/hyperbolic.hpp b/include/kfr/base/impl/hyperbolic.hpp @@ -1,100 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../log_exp.hpp" -#include "../min_max.hpp" -#include "../operators.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sinh(const vec<T, N>& x) -{ - const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); - return (exp(xx) - exp(-xx)) * Tout(0.5); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cosh(const vec<T, N>& x) -{ - const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); - return (exp(xx) + exp(-xx)) * Tout(0.5); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> tanh(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(2 * x); - return (a - 1) / (a + 1); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> coth(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(2 * x); - return (a + 1) / (a - 1); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sinhcosh(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(x); - const vec<Tout, N> b = exp(-x); - return subadd(a, b) * Tout(0.5); -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> coshsinh(const vec<T, N>& x) -{ - const vec<Tout, N> a = exp(x); - const vec<Tout, N> b = exp(-x); - return addsub(a, b) * Tout(0.5); -} - -KFR_I_FLT_CONVERTER(sinh) 
-KFR_I_FLT_CONVERTER(cosh) -KFR_I_FLT_CONVERTER(tanh) -KFR_I_FLT_CONVERTER(coth) -KFR_I_FLT_CONVERTER(sinhcosh) -KFR_I_FLT_CONVERTER(coshsinh) -} // namespace intrinsics -KFR_I_FN(sinh) -KFR_I_FN(cosh) -KFR_I_FN(tanh) -KFR_I_FN(coth) -KFR_I_FN(sinhcosh) -KFR_I_FN(coshsinh) - -} // namespace kfr diff --git a/include/kfr/base/impl/log_exp.hpp b/include/kfr/base/impl/log_exp.hpp @@ -1,315 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../abs.hpp" -#include "../clamp.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../min_max.hpp" -#include "../operators.hpp" -#include "../round.hpp" -#include "../select.hpp" -#include "../shuffle.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <size_t N> -KFR_SINTRIN vec<i32, N> vilogbp1(const vec<f32, N>& d) -{ - mask<i32, N> m = d < 5.421010862427522E-20f; - vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff; - q = select(m, q - (64 + 0x7e), q - 0x7e); - return q; -} - -template <size_t N> -KFR_SINTRIN vec<i64, N> vilogbp1(const vec<f64, N>& d) -{ - mask<i64, N> m = d < 4.9090934652977266E-91; - vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff; - q = select(m, q - (300 + 0x03fe), q - 0x03fe); - return q; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q) -{ - vec<i32, N> m = q >> 31; - m = (((m + q) >> 6) - m) << 4; - const vec<i32, N> qq = q - (m << 2); - m = clamp(m + 0x7f, vec<i32, N>(0xff)); - vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23)); - return x * u * bitcast<f32>((cast<i32>(qq + 0x7f)) << 23); -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q) -{ - vec<i64, N> m = q >> 31; - m = (((m + q) >> 9) - m) << 7; - const vec<i64, N> qq = q - (m << 2); - m = clamp(m + 0x3ff, i64(0x7ff)); - vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52)); - return x * u * bitcast<f64>((cast<i64>(qq + 0x3ff)) << 52); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x) -{ - return select(x == T(), -c_infinity<T>, static_cast<vec<T, N>>(vilogbp1(x) - 1)); -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d) -{ - vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f ); - vec<f32, N> m = vldexpk(d, -e); - - vec<f32, N> x = (m - 1.0f) / (m + 1.0f); - vec<f32, 
N> x2 = x * x; - - vec<f32, N> sp = select(d < 0, constants<f32>::qnan, constants<f32>::neginfinity); - - vec<f32, N> t = 0.2371599674224853515625f; - t = fmadd(t, x2, 0.285279005765914916992188f); - t = fmadd(t, x2, 0.400005519390106201171875f); - t = fmadd(t, x2, 0.666666567325592041015625f); - t = fmadd(t, x2, 2.0f); - - x = x * t + c_log_2<f32> * cast<f32>(e); - x = select(d > 0, x, sp); - - return x; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d) -{ - vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 ); - vec<f64, N> m = vldexpk(d, -e); - - vec<f64, N> x = (m - 1.0) / (m + 1.0); - vec<f64, N> x2 = x * x; - - vec<f64, N> sp = select(d < 0, constants<f64>::qnan, constants<f64>::neginfinity); - - vec<f64, N> t = 0.148197055177935105296783; - t = fmadd(t, x2, 0.153108178020442575739679); - t = fmadd(t, x2, 0.181837339521549679055568); - t = fmadd(t, x2, 0.22222194152736701733275); - t = fmadd(t, x2, 0.285714288030134544449368); - t = fmadd(t, x2, 0.399999999989941956712869); - t = fmadd(t, x2, 0.666666666666685503450651); - t = fmadd(t, x2, 2); - - x = x * t + constants<f64>::log_2 * cast<f64>(e); - x = select(d > 0, x, sp); - - return x; -} - -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x) -{ - return log(cast<Tout>(x)) * constants<Tout>::recip_log_2; -} -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x) -{ - return log(cast<Tout>(x)) * constants<Tout>::recip_log_10; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d) -{ - const f32 ln2_part1 = 0.6931457519f; - const f32 ln2_part2 = 1.4286067653e-6f; - - vec<i32, N> q = cast<i32>(floor(d * constants<f32>::recip_log_2)); - vec<f32, N> s, u; - - s = fmadd(cast<f32>(q), -ln2_part1, d); - s = fmadd(cast<f32>(q), -ln2_part2, s); - - const f32 c2 = 0.4999999105930328369140625f; - const f32 c3 = 0.166668415069580078125f; 
- const f32 c4 = 4.16539050638675689697265625e-2f; - const f32 c5 = 8.378830738365650177001953125e-3f; - const f32 c6 = 1.304379315115511417388916015625e-3f; - const f32 c7 = 2.7555381529964506626129150390625e-4f; - - u = c7; - u = fmadd(u, s, c6); - u = fmadd(u, s, c5); - u = fmadd(u, s, c4); - u = fmadd(u, s, c3); - u = fmadd(u, s, c2); - - u = s * s * u + s + 1.0f; - u = vldexpk(u, q); - - u = select(d == constants<f32>::neginfinity, 0.f, u); - - return u; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d) -{ - const f64 ln2_part1 = 0.69314717501401901245; - const f64 ln2_part2 = 5.545926273775592108e-009; - - vec<i64, N> q = cast<i64>(floor(d * +constants<f64>::recip_log_2)); - vec<f64, N> s, u; - - s = fmadd(cast<f64>(q), -ln2_part1, d); - s = fmadd(cast<f64>(q), -ln2_part2, s); - - const f64 c2 = 0.499999999999994948485237955537741072475910186767578; - const f64 c3 = 0.166666666667024204739888659787538927048444747924805; - const f64 c4 = 4.16666666578945840693215529881854308769106864929199e-2; - const f64 c5 = 8.3333334397461874404333670440792047884315252304077e-3; - const f64 c6 = 1.3888881489747750223179290074426717183087021112442e-3; - const f64 c7 = 1.9841587032493949419205414574918222569976933300495e-4; - const f64 c8 = 2.47929324077393282239802768662784160369483288377523e-5; - const f64 c9 = 2.77076037925831049422552981864598109496000688523054e-6; - const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7; - const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8; - - u = c11; - u = fmadd(u, s, c10); - u = fmadd(u, s, c9); - u = fmadd(u, s, c8); - u = fmadd(u, s, c7); - u = fmadd(u, s, c6); - u = fmadd(u, s, c5); - u = fmadd(u, s, c4); - u = fmadd(u, s, c3); - u = fmadd(u, s, c2); - - u = s * s * u + s + 1.0; - u = vldexpk(u, q); - - u = select(d == constants<f64>::neginfinity, 0.0, u); - - return u; -} -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> 
exp2(const vec<T, N>& x) -{ - return exp(x * constants<Tout>::log_2); -} -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x) -{ - return exp(x * constants<Tout>::log_10); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b) -{ - const vec<T, N> t = exp(b * log(abs(a))); - const mask<T, N> isint = floor(b) == b; - const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0; - return select( - a > T(), t, - select(a == T(), T(), select(isint, select(iseven, t, -t), broadcast<N>(constants<T>::qnan)))); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b) -{ - return exp(reciprocal(b) * log(x)); -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cbrt(const vec<T, N>& x) -{ - return pow<T, N>(x, T(0.333333333333333333333333333333333)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cbrt(const vec<T, N>& x) -{ - return cbrt(cast<Tout>(x)); -} - -KFR_I_FLT_CONVERTER(exp) -KFR_I_FLT_CONVERTER(exp2) -KFR_I_FLT_CONVERTER(exp10) -KFR_I_FLT_CONVERTER(log) -KFR_I_FLT_CONVERTER(log2) -KFR_I_FLT_CONVERTER(log10) -KFR_I_FLT_CONVERTER(logb) -KFR_I_FLT_CONVERTER(pow) -KFR_I_FLT_CONVERTER(root) -KFR_I_FLT_CONVERTER(cbrt) - -template <typename T1, typename T2> -KFR_SINTRIN flt_type<common_type<T1, T2>> logn(const T1& a, const T2& b) -{ - return log(a) / log(b); -} - -template <typename T1, typename T2> -KFR_SINTRIN flt_type<common_type<T1, T2>> logm(const T1& a, const T2& b) -{ - return log(a) * b; -} - -template <typename T1, typename T2, typename T3> -KFR_SINTRIN flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a) -{ - return exp(fmadd(x, m, a)); -} - -template <typename T1, typename T2, typename T3> -KFR_SINTRIN flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a) -{ - 
return fmadd(log(x), m, a); -} -} // namespace intrinsics -KFR_I_FN(exp) -KFR_I_FN(exp2) -KFR_I_FN(exp10) -KFR_I_FN(log) -KFR_I_FN(log2) -KFR_I_FN(log10) -KFR_I_FN(logb) -KFR_I_FN(logn) -KFR_I_FN(logm) -KFR_I_FN(exp_fmadd) -KFR_I_FN(log_fmadd) -KFR_I_FN(pow) -KFR_I_FN(root) -KFR_I_FN(cbrt) - -} // namespace kfr diff --git a/include/kfr/base/impl/logical.hpp b/include/kfr/base/impl/logical.hpp @@ -1,289 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../abs.hpp" -#include "../function.hpp" -#include "../operators.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <size_t bits> -struct bitmask -{ - using type = conditional<(bits > 32), uint64_t, - conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>; - - bitmask(type val) : value(val) {} - - template <typename Itype> - bitmask(Itype val) : value(static_cast<type>(val)) - { - } - - type value; -}; - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -#if defined CMT_ARCH_SSE41 - -// horizontal OR -KFR_SINTRIN bool bittestany(const u8sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const u16sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const u32sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const u64sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i8sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i16sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i32sse& x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(const i64sse& x) { return !_mm_testz_si128(*x, *x); } - -// horizontal AND -KFR_SINTRIN bool bittestall(const u8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i64sse& x) { 
return _mm_testc_si128(*x, *allonesvector(x)); } -#endif - -#if defined CMT_ARCH_AVX -// horizontal OR -KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_ps(*x, *x); } -KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_pd(*x, *x); } - -KFR_SINTRIN bool bittestany(const f32avx& x) { return !_mm256_testz_ps(*x, *x); } -KFR_SINTRIN bool bittestany(const f64avx& x) { return !_mm256_testz_pd(*x, *x); } - -KFR_SINTRIN bool bittestany(const u8avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const u16avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const u32avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const u64avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i8avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i16avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i32avx& x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(const i64avx& x) { return !_mm256_testz_si256(*x, *x); } - -// horizontal AND -KFR_SINTRIN bool bittestall(const f32sse& x) { return _mm_testc_ps(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const f64sse& x) { return _mm_testc_pd(*x, *allonesvector(x)); } - -KFR_SINTRIN bool bittestall(const f32avx& x) { return _mm256_testc_ps(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const f64avx& x) { return _mm256_testc_pd(*x, *allonesvector(x)); } - -KFR_SINTRIN bool bittestall(const u8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const u64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i8avx& x) { return 
_mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } - -#if defined CMT_ARCH_AVX512 -// horizontal OR -KFR_SINTRIN bool bittestany(const f32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const f64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u8avx512& x) { return _mm512_test_epi8_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u16avx512& x) { return _mm512_test_epi16_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i8avx512& x) { return _mm512_test_epi8_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i16avx512& x) { return _mm512_test_epi16_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } - -// horizontal AND -KFR_SINTRIN bool bittestall(const f32avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const f64avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u8avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u16avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u32avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const u64avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i8avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i16avx512& x) { return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i32avx512& x) { 
return !bittestany(~x); } -KFR_SINTRIN bool bittestall(const i64avx512& x) { return !bittestany(~x); } - -#endif - -#elif defined CMT_ARCH_SSE41 -KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } -KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } -KFR_SINTRIN bool bittestall(const f32sse& x) -{ - return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); -} -KFR_SINTRIN bool bittestall(const f64sse& x) -{ - return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); -} -#endif - -#if !defined CMT_ARCH_SSE41 - -KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); } -KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); } -KFR_SINTRIN bool bittestany(const u8sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const u16sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const u32sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const u64sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i8sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i16sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i32sse& x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(const i64sse& x) { return _mm_movemask_epi8(*x); } - -KFR_SINTRIN bool bittestall(const f32sse& x) { return !_mm_movemask_ps(*~x); } -KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); } -KFR_SINTRIN bool bittestall(const u8sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const u16sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const u32sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const u64sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i8sse& x) { return 
!_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i16sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); } -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(expand_simd(a, internal::maskbits<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(expand_simd(a, internal::maskbits<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN bool bittestall(const u32neon& a) -{ - const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a)); - return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; -} - -KFR_SINTRIN bool bittestany(const u32neon& a) -{ - const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a)); - return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; -} -KFR_SINTRIN bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const i64neon& a) 
{ return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); } - -KFR_SINTRIN bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } -KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(expand_simd(a, internal::maskbits<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestall(const vec<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(expand_simd(a, internal::maskbits<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN bool bittestany(const vec<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#else - -template <typename T, size_t N> -KFR_SINTRIN bitmask<N> getmask(const vec<T, N>& x) -{ - typename bitmask<N>::type val = 0; - for (size_t i = 0; i < N; i++) - { - val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i; - } - return val; -} - -template <typename 
T, size_t N> -KFR_SINTRIN bool bittestany(const vec<T, N>& x) -{ - return getmask(x).value; -} -template <typename T, size_t N> -KFR_SINTRIN bool bittestany(const vec<T, N>& x, const vec<T, N>& y) -{ - return bittestany(x & y); -} - -template <typename T, size_t N> -KFR_SINTRIN bool bittestall(const vec<T, N>& x) -{ - return !getmask(~x).value; -} -template <typename T, size_t N> -KFR_SINTRIN bool bittestall(const vec<T, N>& x, const vec<T, N>& y) -{ - return !bittestany(~x & y); -} -#endif -} // namespace intrinsics - -} // namespace kfr diff --git a/include/kfr/base/impl/min_max.hpp b/include/kfr/base/impl/min_max.hpp @@ -1,232 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../abs.hpp" -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); } -KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); } -KFR_SINTRIN u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(*x, *y); } -KFR_SINTRIN i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(*x, *y); } - -KFR_SINTRIN f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(*x, *y); } -KFR_SINTRIN f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(*x, *y); } -KFR_SINTRIN u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(*x, *y); } -KFR_SINTRIN i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(*x, *y); } - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(*x, *y); } -KFR_SINTRIN i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(*x, *y); } -KFR_SINTRIN i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(*x, *y); } -KFR_SINTRIN u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(*x, *y); } -KFR_SINTRIN i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(*x, *y); } -KFR_SINTRIN u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(*x, *y); } - -KFR_SINTRIN u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(*x, *y); } -KFR_SINTRIN i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(*x, *y); } -KFR_SINTRIN i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(*x, *y); } -KFR_SINTRIN u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(*x, *y); } -KFR_SINTRIN i32avx max(const i32avx& x, const i32avx& y) { return 
_mm256_max_epi32(*x, *y); } -KFR_SINTRIN u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(*x, *y); } - -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(*x, *y); } -KFR_SINTRIN i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(*x, *y); } -KFR_SINTRIN i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(*x, *y); } -KFR_SINTRIN u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(*x, *y); } -KFR_SINTRIN i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(*x, *y); } -KFR_SINTRIN u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(*x, *y); } -KFR_SINTRIN u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(*x, *y); } -KFR_SINTRIN i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(*x, *y); } -KFR_SINTRIN i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(*x, *y); } -KFR_SINTRIN u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(*x, *y); } -KFR_SINTRIN i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(*x, *y); } -KFR_SINTRIN u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(*x, *y); } -KFR_SINTRIN i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(*x, *y); } -KFR_SINTRIN u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(*x, *y); } -KFR_SINTRIN i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(*x, *y); } -KFR_SINTRIN u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(*x, *y); } - -KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(*x, *y); } -KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(*x, 
*y); } -KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(*x, *y); } -KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(*x, *y); } - -KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(*x, *y); } -KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(*x, *y); } -KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(*x, *y); } -KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(*x, *y); } -#else -KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } -KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } -KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } -KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } -KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } -KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } -KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } -KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } -#endif - -#if defined CMT_ARCH_AVX -KFR_SINTRIN f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(*x, *y); } -KFR_SINTRIN f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(*x, *y); } -KFR_SINTRIN f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(*x, *y); } -KFR_SINTRIN f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(*x, *y); } -#endif - -#if defined CMT_ARCH_SSE41 -KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(*x, *y); } -KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(*x, *y); } -KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return 
_mm_min_epi32(*x, *y); } -KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(*x, *y); } - -KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(*x, *y); } -KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(*x, *y); } -KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(*x, *y); } -KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(*x, *y); } -#else -KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } -KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } -KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } -KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } - -KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } -KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } -KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } -KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } - -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); } -KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); } -KFR_SINTRIN i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(*x, *y); } -KFR_SINTRIN u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(*x, *y); } -KFR_SINTRIN i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(*x, *y); } -KFR_SINTRIN u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(*x, *y); } -KFR_SINTRIN i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } -KFR_SINTRIN u64neon 
min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } - -KFR_SINTRIN i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(*x, *y); } -KFR_SINTRIN u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(*x, *y); } -KFR_SINTRIN i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(*x, *y); } -KFR_SINTRIN u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(*x, *y); } -KFR_SINTRIN i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(*x, *y); } -KFR_SINTRIN u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(*x, *y); } -KFR_SINTRIN i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } -KFR_SINTRIN u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); } - -KFR_SINTRIN f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(*x, *y); } -KFR_SINTRIN f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(*x, *y); } -#if defined CMT_ARCH_NEON64 -KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(*x, *y); } -KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(*x, *y); } -#else -KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } -KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#else - -// fallback -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x < y, x, y); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x > y, x, y); -} -#endif - -template <typename T> -KFR_SINTRIN T min(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? 
std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_SINTRIN T max(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::min(); -} -template <typename T> -KFR_SINTRIN T absmin(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_SINTRIN T absmax(initialvalue<T>) -{ - return 0; -} - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) -{ - return min(abs(x), abs(y)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) -{ - return max(abs(x), abs(y)); -} - -KFR_I_CONVERTER(min) -KFR_I_CONVERTER(max) -KFR_I_CONVERTER(absmin) -KFR_I_CONVERTER(absmax) -} // namespace intrinsics -KFR_I_FN(min) -KFR_I_FN(max) -KFR_I_FN(absmin) -KFR_I_FN(absmax) - -} // namespace kfr diff --git a/include/kfr/base/impl/modzerobessel.hpp b/include/kfr/base/impl/modzerobessel.hpp @@ -1,105 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" -#include "../log_exp.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wc99-extensions") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") -#endif - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N> -CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) -{ - constexpr static T bessel_coef[] = { T(0.25), - T(0.027777777777777776236), - T(0.0017361111111111110147), - T(6.9444444444444444384e-005), - T(1.9290123456790123911e-006), - T(3.9367598891408417495e-008), - T(6.1511873267825652335e-010), - T(7.5940584281266239246e-012), - T(7.5940584281266233693e-014), - T(6.2760813455591932909e-016), - T(4.3583898233049949985e-018), - T(2.5789288895295827557e-020), - T(1.3157800456783586208e-022), - T(5.8479113141260384983e-025), - T(2.2843403570804837884e-027), - T(7.904291893012054025e-030), - T(2.4395962632753252792e-032), - T(6.75788438580422547e-035), - T(1.689471096451056426e-037), - T(3.8310002187098784929e-040), - T(7.9152897080782616517e-043), - T(1.4962740468957016443e-045), - T(2.5976979980828152196e-048), - T(4.1563167969325041577e-051), - T(6.1483976285983795968e-054), - T(8.434015951438105991e-057), - T(1.0757673407446563809e-059), - T(1.2791526049282476926e-062), - T(1.4212806721424974034e-065), - T(1.4789601166935457918e-068), - T(1.4442969889585408123e-071), - T(1.3262598613026086927e-074), - T(1.1472836170437790782e-077), - T(9.3655805472961564331e-081), - T(7.2265282000741942594e-084), - T(5.2786911614858977913e-087), - T(3.6556032974279072401e-090), - T(2.4034209713529963119e-093), - T(1.5021381070956226783e-096) }; - - const vec<T, N> x_2 = x * 0.5; - const vec<T, N> x_2_sqr = x_2 * x_2; - vec<T, N> num = x_2_sqr; - vec<T, N> result; - result = 1 + x_2_sqr; - - 
CMT_LOOP_UNROLL - for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++) - { - result = fmadd((num *= x_2_sqr), bessel_coef[i], result); - } - return result; -} - -KFR_I_CONVERTER(modzerobessel) -} // namespace intrinsics -KFR_I_FN(modzerobessel) - -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/impl/round.hpp b/include/kfr/base/impl/round.hpp @@ -1,255 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../function.hpp" -#include "../operators.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ss(V) \ - _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_sd(V) \ - _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) -#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) - -#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm512_trunc_ps(V) _mm512_roundscale_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm512_roundnearest_ps(V) _mm512_roundscale_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm512_trunc_pd(V) _mm512_roundscale_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm512_roundnearest_pd(V) _mm512_roundscale_pd((V), 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); } -KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); } -KFR_SINTRIN f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(*value); } -KFR_SINTRIN f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(*value); } -KFR_SINTRIN f64sse floor(const f64sse& value) { return _mm_floor_pd(*value); } -KFR_SINTRIN f64sse ceil(const f64sse& value) { return _mm_ceil_pd(*value); } -KFR_SINTRIN f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(*value); } -KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*value); } -KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); } -KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); } - -#if defined CMT_ARCH_AVX - -KFR_SINTRIN f32avx floor(const f32avx& value) { return _mm256_floor_ps(*value); } -KFR_SINTRIN f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(*value); } -KFR_SINTRIN f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(*value); } -KFR_SINTRIN f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(*value); } -KFR_SINTRIN f64avx floor(const f64avx& value) { return _mm256_floor_pd(*value); } -KFR_SINTRIN f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(*value); } -KFR_SINTRIN f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(*value); } -KFR_SINTRIN f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(*value); } -KFR_SINTRIN f32avx fract(const f32avx& x) { return x - floor(x); } -KFR_SINTRIN f64avx fract(const f64avx& x) { return x - floor(x); } -#endif - -#if defined CMT_ARCH_AVX512 - -KFR_SINTRIN f32avx512 floor(const f32avx512& value) { return _mm512_floor_ps(*value); } -KFR_SINTRIN f32avx512 ceil(const f32avx512& value) { return _mm512_ceil_ps(*value); } -KFR_SINTRIN 
f32avx512 trunc(const f32avx512& value) { return KFR_mm512_trunc_ps(*value); } -KFR_SINTRIN f32avx512 round(const f32avx512& value) { return KFR_mm512_roundnearest_ps(*value); } -KFR_SINTRIN f64avx512 floor(const f64avx512& value) { return _mm512_floor_pd(*value); } -KFR_SINTRIN f64avx512 ceil(const f64avx512& value) { return _mm512_ceil_pd(*value); } -KFR_SINTRIN f64avx512 trunc(const f64avx512& value) { return KFR_mm512_trunc_pd(*value); } -KFR_SINTRIN f64avx512 round(const f64avx512& value) { return KFR_mm512_roundnearest_pd(*value); } -KFR_SINTRIN f32avx512 fract(const f32avx512& x) { return x - floor(x); } -KFR_SINTRIN f64avx512 fract(const f64avx512& x) { return x - floor(x); } -#endif - -KFR_HANDLE_ALL_SIZES_F_1(floor) -KFR_HANDLE_ALL_SIZES_F_1(ceil) -KFR_HANDLE_ALL_SIZES_F_1(round) -KFR_HANDLE_ALL_SIZES_F_1(trunc) -KFR_HANDLE_ALL_SIZES_F_1(fract) - -#else - -// fallback - -template <size_t N> -KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x) -{ - vec<f32, N> t = cast<f32>(cast<i32>(x)); - return t - select(x < t, 1.f, 0.f); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x) -{ - vec<f64, N> t = cast<f64>(cast<i64>(x)); - return t - select(x < t, 1., 0.); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x) -{ - vec<f32, N> t = cast<f32>(cast<i32>(x)); - return t + select(x > t, 1.f, 0.f); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x) -{ - vec<f64, N> t = cast<f64>(cast<i64>(x)); - return t + select(x > t, 1., 0.); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x) -{ - return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x))); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> round(const vec<f64, N>& x) -{ - return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x))); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> trunc(const vec<f32, N>& x) -{ - return cast<f32>(cast<i32>(x)); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> trunc(const 
vec<f64, N>& x) -{ - return cast<f64>(cast<i64>(x)); -} -template <size_t N> -KFR_SINTRIN vec<f32, N> fract(const vec<f32, N>& x) -{ - return x - floor(x); -} -template <size_t N> -KFR_SINTRIN vec<f64, N> fract(const vec<f64, N>& x) -{ - return x - floor(x); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> floor(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> ceil(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> trunc(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> round(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> fract(const vec<T, N>&) -{ - return T(0); -} - -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> ifloor(const vec<T, N>& value) -{ - return cast<IT>(floor(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> iceil(const vec<T, N>& value) -{ - return cast<IT>(ceil(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> itrunc(const vec<T, N>& value) -{ - return cast<IT>(trunc(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<IT, N> iround(const vec<T, N>& value) -{ - return cast<IT>(round(value)); -} - -KFR_I_CONVERTER(floor) -KFR_I_CONVERTER(ceil) -KFR_I_CONVERTER(round) -KFR_I_CONVERTER(trunc) -KFR_I_CONVERTER(fract) -KFR_I_CONVERTER(ifloor) -KFR_I_CONVERTER(iceil) -KFR_I_CONVERTER(iround) -KFR_I_CONVERTER(itrunc) -} // namespace intrinsics -KFR_I_FN(floor) -KFR_I_FN(ceil) -KFR_I_FN(round) -KFR_I_FN(trunc) -KFR_I_FN(fract) -KFR_I_FN(ifloor) -KFR_I_FN(iceil) -KFR_I_FN(iround) -KFR_I_FN(itrunc) 
- -} // namespace kfr - -#undef KFR_mm_trunc_ps -#undef KFR_mm_roundnearest_ps -#undef KFR_mm_trunc_pd -#undef KFR_mm_roundnearest_pd -#undef KFR_mm_trunc_ss -#undef KFR_mm_roundnearest_ss -#undef KFR_mm_trunc_sd -#undef KFR_mm_roundnearest_sd -#undef KFR_mm_floor_ss -#undef KFR_mm_floor_sd -#undef KFR_mm_ceil_ss -#undef KFR_mm_ceil_sd -#undef KFR_mm256_trunc_ps -#undef KFR_mm256_roundnearest_ps -#undef KFR_mm256_trunc_pd -#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/base/impl/saturation.hpp b/include/kfr/base/impl/saturation.hpp @@ -1,192 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../function.hpp" -#include "../select.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -// Generic functions -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> sum = aa + bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= 0, a, bitcast<T>(sum)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> diff = aa - bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < 0, a, bitcast<T>(diff)); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b) -{ - const vec<T, N> t = allonesvector(a); - return select(a > t - b, t, a + b); -} -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - return select(a < b, zerovector(a), a - b); -} - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); } -KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); } -KFR_SINTRIN u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(*x, *y); } -KFR_SINTRIN i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(*x, *y); } - -KFR_SINTRIN u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(*x, *y); } -KFR_SINTRIN i8sse satsub(const i8sse& x, 
const i8sse& y) { return _mm_subs_epi8(*x, *y); } -KFR_SINTRIN u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(*x, *y); } -KFR_SINTRIN i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(*x, *y); } - -KFR_SINTRIN i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); } -KFR_SINTRIN u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); } - -KFR_SINTRIN i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } -KFR_SINTRIN u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(*x, *y); } -KFR_SINTRIN i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(*x, *y); } -KFR_SINTRIN u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(*x, *y); } -KFR_SINTRIN i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(*x, *y); } - -KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(*x, *y); } -KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); } -KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); } -KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); } - -KFR_SINTRIN i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN i64avx satadd(const 
i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } -KFR_SINTRIN u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } - -KFR_SINTRIN i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } -KFR_SINTRIN u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(*x, *y); } -KFR_SINTRIN i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(*x, *y); } -KFR_SINTRIN u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(*x, *y); } -KFR_SINTRIN i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(*x, *y); } -KFR_SINTRIN u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(*x, *y); } -KFR_SINTRIN i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(*x, *y); } -KFR_SINTRIN u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(*x, *y); } -KFR_SINTRIN i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(*x, *y); } - -KFR_SINTRIN i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } -KFR_SINTRIN u32avx512 satadd(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_add(a, b); } -KFR_SINTRIN u64avx512 satadd(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_add(a, 
b); } -KFR_SINTRIN i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } -KFR_SINTRIN u32avx512 satsub(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_sub(a, b); } -KFR_SINTRIN u64avx512 satsub(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_sub(a, b); } -#endif - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); } -KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); } -KFR_SINTRIN u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(*x, *y); } -KFR_SINTRIN i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(*x, *y); } -KFR_SINTRIN u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(*a, *b); } -KFR_SINTRIN i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(*a, *b); } -KFR_SINTRIN u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(*a, *b); } -KFR_SINTRIN i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(*a, *b); } - -KFR_SINTRIN u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(*x, *y); } -KFR_SINTRIN i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(*x, *y); } -KFR_SINTRIN u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(*x, *y); } -KFR_SINTRIN i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(*x, *y); } -KFR_SINTRIN u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(*a, *b); } -KFR_SINTRIN i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(*a, *b); } -KFR_SINTRIN u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(*a, *b); } 
-KFR_SINTRIN i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(*a, *b); } - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#else -// fallback -template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> -KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> -KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> -KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_sub(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> -KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_sub(a, b); -} -#endif -KFR_I_CONVERTER(satadd) -KFR_I_CONVERTER(satsub) -} // namespace intrinsics -KFR_I_FN(satadd) -KFR_I_FN(satsub) -} // namespace kfr diff --git a/include/kfr/base/impl/select.hpp b/include/kfr/base/impl/select.hpp @@ -1,261 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" - -namespace kfr -{ -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN u8sse select(const maskfor<u8sse>& m, const u8sse& x, const u8sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u16sse select(const maskfor<u16sse>& m, const u16sse& x, const u16sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u32sse select(const maskfor<u32sse>& m, const u32sse& x, const u32sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u64sse select(const maskfor<u64sse>& m, const u64sse& x, const u64sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i8sse select(const maskfor<i8sse>& m, const i8sse& x, const i8sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i16sse select(const maskfor<i16sse>& m, const i16sse& x, const i16sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i32sse select(const maskfor<i32sse>& m, const i32sse& x, const i32sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i64sse select(const maskfor<i64sse>& m, const i64sse& x, const i64sse& y) -{ - return _mm_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN f32sse select(const maskfor<f32sse>& m, const f32sse& x, const f32sse& y) -{ - return _mm_blendv_ps(*y, *x, *m); -} -KFR_SINTRIN f64sse select(const maskfor<f64sse>& m, const f64sse& x, const f64sse& y) -{ - return _mm_blendv_pd(*y, *x, *m); -} - -#if defined CMT_ARCH_AVX -KFR_SINTRIN f64avx select(const maskfor<f64avx>& m, const f64avx& x, const f64avx& y) -{ - return _mm256_blendv_pd(*y, *x, *m); -} -KFR_SINTRIN f32avx select(const maskfor<f32avx>& m, const f32avx& x, const f32avx& y) -{ - return _mm256_blendv_ps(*y, *x, *m); -} -#endif - -#if defined CMT_ARCH_AVX2 -KFR_SINTRIN u8avx 
select(const maskfor<u8avx>& m, const u8avx& x, const u8avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u16avx select(const maskfor<u16avx>& m, const u16avx& x, const u16avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u32avx select(const maskfor<u32avx>& m, const u32avx& x, const u32avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN u64avx select(const maskfor<u64avx>& m, const u64avx& x, const u64avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i8avx select(const maskfor<i8avx>& m, const i8avx& x, const i8avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i16avx select(const maskfor<i16avx>& m, const i16avx& x, const i16avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i32avx select(const maskfor<i32avx>& m, const i32avx& x, const i32avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -KFR_SINTRIN i64avx select(const maskfor<i64avx>& m, const i64avx& x, const i64avx& y) -{ - return _mm256_blendv_epi8(*y, *x, *m); -} -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN f64avx512 select(const maskfor<f64avx512>& m, const f64avx512& x, const f64avx512& y) -{ - return _mm512_mask_blend_pd(_mm512_test_epi64_mask(*m, *m), *y, *x); -} -KFR_SINTRIN f32avx512 select(const maskfor<f32avx512>& m, const f32avx512& x, const f32avx512& y) -{ - return _mm512_mask_blend_ps(_mm512_test_epi32_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u8avx512 select(const maskfor<u8avx512>& m, const u8avx512& x, const u8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_test_epi8_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u16avx512 select(const maskfor<u16avx512>& m, const u16avx512& x, const u16avx512& y) -{ - return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u32avx512 select(const maskfor<u32avx512>& m, const u32avx512& x, const u32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x); -} -KFR_SINTRIN u64avx512 
select(const maskfor<u64avx512>& m, const u64avx512& x, const u64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const i8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_test_epi8_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i16avx512 select(const maskfor<i16avx512>& m, const i16avx512& x, const i16avx512& y) -{ - return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i32avx512 select(const maskfor<i32avx512>& m, const i32avx512& x, const i32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x); -} -KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, const i64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat(select(low(a.asvec()).asmask(), low(b), low(c)), - select(high(a.asvec()).asmask(), high(b), high(c))); -} - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32neon select(const maskfor<f32neon>& m, const f32neon& x, const f32neon& y) -{ - return vbslq_f32(*m, *x, *y); -} - -KFR_SINTRIN i8neon select(const maskfor<i8neon>& m, const i8neon& x, const i8neon& y) -{ - return vbslq_s8(*m, *x, *y); -} -KFR_SINTRIN u8neon select(const maskfor<u8neon>& m, const u8neon& x, const u8neon& y) -{ - return vbslq_u8(*m, *x, *y); -} -KFR_SINTRIN i16neon select(const maskfor<i16neon>& m, const i16neon& x, const i16neon& y) -{ 
- return vbslq_s16(*m, *x, *y); -} -KFR_SINTRIN u16neon select(const maskfor<u16neon>& m, const u16neon& x, const u16neon& y) -{ - return vbslq_u16(*m, *x, *y); -} -KFR_SINTRIN i32neon select(const maskfor<i32neon>& m, const i32neon& x, const i32neon& y) -{ - return vbslq_s32(*m, *x, *y); -} -KFR_SINTRIN u32neon select(const maskfor<u32neon>& m, const u32neon& x, const u32neon& y) -{ - return vbslq_u32(*m, *x, *y); -} -KFR_SINTRIN i64neon select(const maskfor<i64neon>& m, const i64neon& x, const i64neon& y) -{ - return vbslq_s64(*m, *x, *y); -} -KFR_SINTRIN u64neon select(const maskfor<u64neon>& m, const u64neon& x, const u64neon& y) -{ - return vbslq_u64(*m, *x, *y); -} - -#ifdef CMT_ARCH_NEON64 -KFR_SINTRIN f64neon select(const maskfor<f64neon>& m, const f64neon& x, const f64neon& y) -{ - return vbslq_f64(*m, *x, *y); -} -#else -KFR_SINTRIN f64neon select(const maskfor<f64neon>& m, const f64neon& x, const f64neon& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat(select(low(a.asvec()).asmask(), low(b), low(c)), - select(high(a.asvec()).asmask(), high(b), high(c))); -} - -#else - -// fallback -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -#endif - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) -{ - return select(m.asmask(), x, y); -} -} // namespace intrinsics -KFR_I_FN(select) - -} // namespace 
kfr diff --git a/include/kfr/base/impl/sin_cos.hpp b/include/kfr/base/impl/sin_cos.hpp @@ -1,338 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../min_max.hpp" -#include "../operators.hpp" -#include "../round.hpp" -#include "../select.hpp" -#include "../shuffle.hpp" - -#if CMT_HAS_WARNING("-Wc99-extensions") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") -#endif - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N> -KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0) -{ - return select(msk, a0, b0); -} - -template <typename T, size_t N, typename... Ts> -KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0, - const T& a1, const T& b1, const Ts&... 
values) -{ - return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0)); -} - -template <typename T, size_t N, typename Tprecise = f64> -KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) -{ - const vec<T, N> xabs = abs(x); - constexpr T div = constants<T>::fold_constant_div; - vec<T, N> y = floor(xabs / div); - quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0)); - - const mask<T, N> msk = (quadrant & 1) != 0; - quadrant = kfr::select(msk, quadrant + 1, quadrant); - y = select(msk, y + T(1.0), y); - quadrant = quadrant & 7; - - constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi); - constexpr T rem1 = constants<T>::fold_constant_rem1; - constexpr T rem2 = constants<T>::fold_constant_rem2; - return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask) -{ - constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, -1.6666667163e-01f); - constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f); - constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f); - constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f); - constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f); - constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f); - constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f); - constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f); - constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f); - constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f); - - const vec<f32, N> x2 = folded * folded; - - vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, - cos_c8, sin_c8, cos_c10, sin_c10); - - formula = select(cosmask, formula, formula * folded); - return formula; -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> 
trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask) -{ - constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01); - constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03); - constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04); - constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06); - constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08); - constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10); - constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13); - constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14); - constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01); - constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02); - constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03); - constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 2.480158730125666056e-05); - constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07); - constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09); - constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11); - constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14); - - vec<f64, N> x2 = folded * folded; - vec<f64, N> formula = - trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8, - cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16); - - formula = select(cosmask, formula, formula * folded); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> -KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask) -{ - vec<itype<T>, N> quadrant; - vec<T, N> folded = trig_fold(x_full, quadrant); - - mask<T, N> flip_sign = - kfr::select(cosmask, ((quadrant == 2) 
|| (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask(); - - mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); - usecos = usecos ^ cosmask; - - vec<T, N> formula = trig_sincos(folded, usecos); - - mask<T, N> negmask = x_full < 0; - - flip_sign = flip_sign ^ (negmask & ~cosmask); - - formula = select(flip_sign, -formula, formula); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x) -{ - vec<itype<T>, N> quadrant; - vec<T, N> folded = trig_fold(x, quadrant); - - mask<T, N> flip_sign = quadrant >= 4; - mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); - - vec<T, N> formula = trig_sincos(folded, usecos); - - formula = select(flip_sign ^ mask<T, N>(x), -formula, formula); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x) -{ - vec<itype<T>, N> quadrant; - vec<T, N> folded = trig_fold(x, quadrant); - - mask<T, N> eq4 = (quadrant == 4); - mask<T, N> flip_sign = (quadrant == 2) || eq4; - mask<T, N> usecos = (quadrant == 0) || eq4; - - vec<T, N> formula = trig_sincos(folded, usecos); - - formula = select(flip_sign, -formula, formula); - return formula; -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x) -{ - const vec<T, N> msk = broadcast<N>(constants<T>::highbitmask()); - - constexpr static T c2 = -0.16665853559970855712890625; - constexpr static T c4 = +8.31427983939647674560546875e-3; - constexpr static T c6 = -1.85423981747590005397796630859375e-4; - - const vec<T, N> pi = c_pi<T>; - - vec<T, N> xx = x - pi; - vec<T, N> y = abs(xx); - y = select(y > c_pi<T, 1, 2>, pi - y, y); - y = y ^ (msk & ~xx); - - vec<T, N> y2 = y * y; - vec<T, N> formula = c6; - vec<T, N> y3 = y2 * y; - formula = fmadd(formula, y2, c4); - formula = fmadd(formula, y2, c2); - formula = formula * y3 + y; - return formula; -} - 
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> fastcos(const vec<T, N>& x) -{ - x += c_pi<T, 1, 2>; - x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x); - return fastsin(x); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> sincos(const vec<T, N>& x) -{ - return sincos_mask(x, internal::oddmask<T, N>()); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x) -{ - return sincos_mask(x, internal::evenmask<T, N>()); -} - -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x) -{ - return select(abs(x) <= constants<T>::epsilon, T(1), sin(x) / x); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sin(const vec<T, N>& x) -{ - return sin(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cos(const vec<T, N>& x) -{ - return cos(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> fastsin(const vec<T, N>& x) -{ - return fastsin(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> fastcos(const vec<T, N>& x) -{ - return fastcos(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sincos(const vec<T, N>& x) -{ - return sincos(cast<Tout>(x)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> cossin(const vec<T, N>& x) -{ - return cossin(cast<Tout>(x)); -} - -template <typename T, size_t N, 
KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sinc(const vec<T, N>& x) -{ - return sinc(cast<Tout>(x)); -} - -KFR_I_FLT_CONVERTER(sin) -KFR_I_FLT_CONVERTER(cos) -KFR_I_FLT_CONVERTER(fastsin) -KFR_I_FLT_CONVERTER(fastcos) -KFR_I_FLT_CONVERTER(sincos) -KFR_I_FLT_CONVERTER(cossin) -KFR_I_FLT_CONVERTER(sinc) - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout sindeg(const T& x) -{ - return sin(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout cosdeg(const T& x) -{ - return cos(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout fastsindeg(const T& x) -{ - return fastsin(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout fastcosdeg(const T& x) -{ - return fastcos(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout sincosdeg(const T& x) -{ - return sincos(x * constants<Tout>::degtorad); -} - -template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout cossindeg(const T& x) -{ - return cossin(x * constants<Tout>::degtorad); -} -} // namespace intrinsics - -KFR_I_FN(sin) -KFR_I_FN(cos) -KFR_I_FN(fastsin) -KFR_I_FN(fastcos) -KFR_I_FN(sincos) -KFR_I_FN(cossin) - -KFR_I_FN(sindeg) -KFR_I_FN(cosdeg) -KFR_I_FN(fastsindeg) -KFR_I_FN(fastcosdeg) -KFR_I_FN(sincosdeg) -KFR_I_FN(cossindeg) - -KFR_I_FN(sinc) - -} // namespace kfr diff --git a/include/kfr/base/impl/sqrt.hpp b/include/kfr/base/impl/sqrt.hpp @@ -1,71 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../function.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(f32x4(_mm_sqrt_ss(*extend<4>(x)))); } -KFR_SINTRIN f64x1 sqrt(const f64x1& x) -{ - return slice<0, 1>(f64x2(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x)))); -} -KFR_SINTRIN f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(*x); } -KFR_SINTRIN f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(*x); } - -#if defined CMT_ARCH_AVX -KFR_SINTRIN f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(*x); } -KFR_SINTRIN f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(*x); } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_SINTRIN f32avx512 sqrt(const f32avx512& x) { return _mm512_sqrt_ps(*x); } -KFR_SINTRIN f64avx512 sqrt(const f64avx512& x) { return _mm512_sqrt_pd(*x); } -#endif - -KFR_HANDLE_ALL_SIZES_FLT_1(sqrt) - -#else - -// fallback -template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> sqrt(const vec<T, N>& x) -{ - return apply([](Tout x) { return std::sqrt(x); }, cast<Tout>(x)); -} -#endif -KFR_I_FLT_CONVERTER(sqrt) -} // namespace intrinsics -KFR_I_FN(sqrt) - -} // namespace kfr diff --git a/include/kfr/base/impl/tan.hpp b/include/kfr/base/impl/tan.hpp @@ -1,141 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright 
(C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../abs.hpp" -#include "../constants.hpp" -#include "../function.hpp" -#include "../operators.hpp" -#include "../select.hpp" -#include "../sin_cos.hpp" - -namespace kfr -{ - -namespace intrinsics -{ - -template <typename T, size_t N, typename IT = itype<T>> -KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse) -{ - constexpr T pi_14 = c_pi<T, 1, 4>; - - vec<T, N> y = abs(x_full); - vec<T, N> scaled = y / pi_14; - - vec<T, N> k_real = floor(scaled); - vec<IT, N> k = cast<IT>(k_real); - - vec<T, N> x = y - k_real * pi_14; - - mask<T, N> need_offset = (k & 1) != 0; - x = select(need_offset, x - pi_14, x); - - vec<IT, N> k_mod4 = k & 3; - inverse = (k_mod4 == 1) || (k_mod4 == 2); - return x; -} - -template <size_t N> -KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full) -{ - mask<f32, N> inverse; - vec<i32, N> quad; - const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); - inverse = quad == 2 || quad == 6; - - constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 
3.333315551280975342e-01); - constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01); - constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02); - constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02); - constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03); - constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03); - - constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01); - constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02); - constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03); - constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04); - constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05); - constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, -4.153305894760705996e-07); - - const vec<f32, N> x2 = x * x; - const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, - tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12); - - const vec<f32, N> z = select(inverse, val / -x, val * x); - return mulsign(z, x_full); -} - -template <size_t N> -KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full) -{ - mask<f64, N> inverse; - vec<i64, N> quad; - const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); - inverse = quad == 2 || quad == 6; - - constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01); - constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01); - constexpr f64 tan_c6 = CMT_FP(0xd.d0d90de32b3e8p-8, 5.396801556632355862e-02); - constexpr f64 tan_c8 = CMT_FP(0x5.99723bdcf5cacp-8, 2.187265359403693307e-02); - constexpr f64 tan_c10 = CMT_FP(0x2.434a142e413ap-8, 8.839254309582239566e-03); - constexpr f64 tan_c12 = CMT_FP(0xf.2b59061305efp-12, 3.703449009834865711e-03); - constexpr f64 tan_c14 = CMT_FP(0x4.a12565071a664p-12, 1.130243370829653185e-03); - constexpr f64 tan_c16 = 
CMT_FP(0x4.dada3797ac1bcp-12, 1.185276423238536747e-03); - constexpr f64 tan_c18 = CMT_FP(-0x1.a74976b6ea3f3p-12, -4.036779095551438937e-04); - constexpr f64 tan_c20 = CMT_FP(0x1.d06a5ae5e4a74p-12, 4.429010863244216712e-04); - - constexpr f64 cot_c2 = CMT_FP(-0x5.5555555555554p-4, -3.333333333333333148e-01); - constexpr f64 cot_c4 = CMT_FP(-0x5.b05b05b05b758p-8, -2.222222222222377391e-02); - constexpr f64 cot_c6 = CMT_FP(-0x8.ab355dffc79a8p-12, -2.116402116358796163e-03); - constexpr f64 cot_c8 = CMT_FP(-0xd.debbca405c9f8p-16, -2.116402122295888289e-04); - constexpr f64 cot_c10 = CMT_FP(-0x1.66a8edb99b15p-16, -2.137779458737224013e-05); - constexpr f64 cot_c12 = CMT_FP(-0x2.450239be0ee92p-20, -2.164426049513111728e-06); - constexpr f64 cot_c14 = CMT_FP(-0x3.ad6ddb4719438p-24, -2.191935496317727080e-07); - constexpr f64 cot_c16 = CMT_FP(-0x5.ff4c42741356p-28, -2.234152473099993830e-08); - constexpr f64 cot_c18 = CMT_FP(-0x9.06881bcdf3108p-32, -2.101416316020595077e-09); - constexpr f64 cot_c20 = CMT_FP(-0x1.644abedc113cap-32, -3.240456633529511097e-10); - - const vec<f64, N> x2 = x * x; - const vec<f64, N> val = trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, - cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14, - cot_c16, tan_c16, cot_c18, tan_c18, cot_c20, tan_c20); - - const vec<f64, N> z = select(inverse, val / -x, val * x); - return mulsign(z, x_full); -} - -KFR_I_FLT_CONVERTER(tan) -template <typename T> -KFR_SINTRIN flt_type<T> tandeg(const T& x) -{ - return tan(x * c_degtorad<flt_type<T>>); -} -} // namespace intrinsics -KFR_I_FN(tan) -KFR_I_FN(tandeg) - -} // namespace kfr diff --git a/include/kfr/base/intrinsics.h b/include/kfr/base/intrinsics.h @@ -1,18 +0,0 @@ -#pragma once - -#include "kfr.h" - -#ifdef CMT_ARCH_SSE2 -#include <immintrin.h> -#ifdef CMT_OS_WIN -#include <intrin.h> -#endif -#endif - -#ifdef CMT_ARCH_NEON -#include <arm_neon.h> -#endif - -#if defined CMT_COMPILER_GCC && defined 
CMT_ARCH_X86 -#include <x86intrin.h> -#endif diff --git a/include/kfr/base/kfr.h b/include/kfr/base/kfr.h @@ -1,46 +0,0 @@ -/** @addtogroup utility - * @{ - */ -#pragma once - -#include <stddef.h> -#include <stdint.h> - -#include "../cident.h" - -#define KFR_VERSION_MAJOR 3 -#define KFR_VERSION_MINOR 0 -#define KFR_VERSION_BUILD 4 -#define KFR_VERSION_STRING \ - CMT_STRINGIFY(KFR_VERSION_MAJOR) "." CMT_STRINGIFY(KFR_VERSION_MINOR) "." CMT_STRINGIFY(KFR_VERSION_BUILD) -#define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_BUILD) - -#ifdef CMT_ARCH_X64 -#define KFR_VERSION_FULL \ - "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit (" CMT_COMPIER_NAME "/" CMT_OS_NAME \ - ")" -#else -#define KFR_VERSION_FULL \ - "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit (" CMT_COMPIER_NAME "/" CMT_OS_NAME \ - ")" -#endif - -#ifdef __cplusplus -namespace kfr -{ -/// @brief KFR version string -constexpr const char version_string[] = KFR_VERSION_STRING; - -constexpr int version_major = KFR_VERSION_MAJOR; -constexpr int version_minor = KFR_VERSION_MINOR; -constexpr int version_build = KFR_VERSION_BUILD; -constexpr int version = KFR_VERSION; - -/// @brief KFR version string including architecture and compiler name -constexpr const char version_full[] = KFR_VERSION_FULL; -} // namespace kfr -#endif - -#define KFR_INTRIN CMT_INTRIN -#define KFR_FUNC CMT_FUNC -#define KFR_SINTRIN CMT_INTRIN static diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp @@ -1,229 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/log_exp.hpp" - -namespace kfr -{ - -/// @brief Returns e raised to the given power x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> exp(const T1& x) -{ - return intrinsics::exp(x); -} - -/// @brief Returns e raised to the given power x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::exp, E1> exp(E1&& x) -{ - return { fn::exp(), std::forward<E1>(x) }; -} - -/// @brief Returns 2 raised to the given power x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> exp2(const T1& x) -{ - return intrinsics::exp2(x); -} - -/// @brief Returns 2 raised to the given power x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::exp2, E1> exp2(E1&& x) -{ - return { fn::exp2(), std::forward<E1>(x) }; -} - -/// @brief Returns 10 raised to the given power x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> exp10(const T1& x) -{ - return intrinsics::exp10(x); -} - -/// @brief Returns 10 raised to the given power x. Version that accepts and returns expressions. 
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::exp10, E1> exp10(E1&& x) -{ - return { fn::exp10(), std::forward<E1>(x) }; -} - -/// @brief Returns the natural logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> log(const T1& x) -{ - return intrinsics::log(x); -} - -/// @brief Returns the natural logarithm of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::log, E1> log(E1&& x) -{ - return { fn::log(), std::forward<E1>(x) }; -} - -/// @brief Returns the binary (base-2) logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> log2(const T1& x) -{ - return intrinsics::log2(x); -} - -/// @brief Returns the binary (base-2) logarithm of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::log2, E1> log2(E1&& x) -{ - return { fn::log2(), std::forward<E1>(x) }; -} - -/// @brief Returns the common (base-10) logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> log10(const T1& x) -{ - return intrinsics::log10(x); -} - -/// @brief Returns the common (base-10) logarithm of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::log10, E1> log10(E1&& x) -{ - return { fn::log10(), std::forward<E1>(x) }; -} - -/// @brief Returns the rounded binary (base-2) logarithm of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> logb(const T1& x) -{ - return intrinsics::logb(x); -} - -/// @brief Returns the rounded binary (base-2) logarithm of the x. 
Version that accepts and returns -/// expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::logb, E1> logb(E1&& x) -{ - return { fn::logb(), std::forward<E1>(x) }; -} - -/// @brief Returns the logarithm of the x with base y. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> logn(const T1& x, const T2& y) -{ - return intrinsics::logn(x, y); -} - -/// @brief Returns the logarithm of the x with base y. Version that accepts and returns expressions. -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::logn, E1, E2> logn(E1&& x, E2&& y) -{ - return { fn::logn(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns log(x) * y. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> logm(const T1& x, const T2& y) -{ - return intrinsics::logm(x, y); -} - -/// @brief Returns log(x) * y. Version that accepts and returns expressions. -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::logm, E1, E2> logm(E1&& x, E2&& y) -{ - return { fn::logm(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns exp(x * m + a). -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_FUNC flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& y, const T3& z) -{ - return intrinsics::exp_fmadd(x, y, z); -} - -/// @brief Returns exp(x * m + a). Version that accepts and returns expressions. 
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_FUNC internal::expression_function<fn::exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z) -{ - return { fn::exp_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; -} - -/// @brief Returns log(x) * m + a. -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_FUNC flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& y, const T3& z) -{ - return intrinsics::log_fmadd(x, y, z); -} - -/// @brief Returns log(x) * m + a. Version that accepts and returns expressions. -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_FUNC internal::expression_function<fn::log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z) -{ - return { fn::log_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; -} - -/// @brief Returns the x raised to the given power y. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> pow(const T1& x, const T2& y) -{ - return intrinsics::pow(x, y); -} - -/// @brief Returns the x raised to the given power y. Version that accepts and returns expressions. -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::pow, E1, E2> pow(E1&& x, E2&& y) -{ - return { fn::pow(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns the real nth root of the x. -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> -KFR_FUNC flt_type<common_type<T1, T2>> root(const T1& x, const T2& y) -{ - return intrinsics::root(x, y); -} - -/// @brief Returns the real nth root of the x. Version that accepts and returns expressions. 
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_FUNC internal::expression_function<fn::root, E1, E2> root(E1&& x, E2&& y) -{ - return { fn::root(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Returns the cube root of the x. -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cbrt(const T1& x) -{ - return intrinsics::cbrt(x); -} - -/// @brief Returns the cube root of the x. Version that accepts and returns expressions. -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cbrt, E1> cbrt(E1&& x) -{ - return { fn::cbrt(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -1,50 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/logical.hpp" - -namespace kfr -{ - -/** - * @brief Returns x[0] && x[1] && ... 
&& x[N-1] - */ -template <typename T, size_t N> -KFR_SINTRIN bool all(const mask<T, N>& x) -{ - return intrinsics::bittestall(x.asvec()); -} - -/** - * @brief Returns x[0] || x[1] || ... || x[N-1] - */ -template <typename T, size_t N> -KFR_SINTRIN bool any(const mask<T, N>& x) -{ - return intrinsics::bittestany(x.asvec()); -} -} // namespace kfr diff --git a/include/kfr/base/memory.hpp b/include/kfr/base/memory.hpp @@ -1,4 +1,4 @@ -/** @addtogroup utility +/** @addtogroup memory * @{ */ /* @@ -25,8 +25,8 @@ */ #pragma once -#include "read_write.hpp" -#include "types.hpp" +#include "../simd/read_write.hpp" +#include "../simd/types.hpp" #include <algorithm> #include <atomic> #include <memory> @@ -34,7 +34,7 @@ namespace kfr { -namespace internal +namespace internal_generic { struct memory_statistics @@ -51,6 +51,8 @@ inline memory_statistics& get_memory_statistics() return ms; } +#pragma pack(push, 1) + struct mem_header { u8 offset; @@ -60,13 +62,18 @@ struct mem_header unsigned int references_uint; size_t size; - CMT_INLINE std::atomic_uint& references() { return reinterpret_cast<std::atomic_uint&>(references_uint); } + KFR_MEM_INTRINSIC std::atomic_uint& references() + { + return reinterpret_cast<std::atomic_uint&>(references_uint); + } } #ifdef CMT_GNU_ATTRIBUTES __attribute__((__packed__)) #endif ; +#pragma pack(pop) + inline mem_header* aligned_header(void* ptr) { return ptr_cast<mem_header>(ptr) - 1; } inline size_t aligned_size(void* ptr) { return aligned_header(ptr)->size; } @@ -103,58 +110,58 @@ inline void aligned_free(void* ptr) } inline void aligned_release(void* ptr) { aligned_free(ptr); } -} // namespace internal +} // namespace internal_generic /// @brief Allocates aligned memory template <typename T = void, size_t alignment = platform<>::native_cache_alignment> -CMT_INLINE T* aligned_allocate(size_t size = 1) +KFR_INTRINSIC T* aligned_allocate(size_t size = 1) { T* ptr = static_cast<T*>(CMT_ASSUME_ALIGNED( - 
internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment), + internal_generic::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment), alignment)); return ptr; } /// @brief Deallocates aligned memory template <typename T = void> -CMT_INLINE void aligned_deallocate(T* ptr) +KFR_INTRINSIC void aligned_deallocate(T* ptr) { - return internal::aligned_free(ptr); + return internal_generic::aligned_free(ptr); } -namespace internal +namespace internal_generic { template <typename T> struct aligned_deleter { - CMT_INLINE void operator()(T* ptr) const { aligned_deallocate(ptr); } + KFR_MEM_INTRINSIC void operator()(T* ptr) const { aligned_deallocate(ptr); } }; -} // namespace internal +} // namespace internal_generic template <typename T> struct autofree { - CMT_INLINE autofree() {} - explicit CMT_INLINE autofree(size_t size) : ptr(aligned_allocate<T>(size)) {} + KFR_MEM_INTRINSIC autofree() {} + explicit KFR_MEM_INTRINSIC autofree(size_t size) : ptr(aligned_allocate<T>(size)) {} autofree(const autofree&) = delete; autofree& operator=(const autofree&) = delete; - autofree(autofree&&) noexcept = default; - autofree& operator=(autofree&&) noexcept = default; - CMT_INLINE T& operator[](size_t index) noexcept { return ptr[index]; } - CMT_INLINE const T& operator[](size_t index) const noexcept { return ptr[index]; } + autofree(autofree&&) CMT_NOEXCEPT = default; + autofree& operator=(autofree&&) CMT_NOEXCEPT = default; + KFR_MEM_INTRINSIC T& operator[](size_t index) CMT_NOEXCEPT { return ptr[index]; } + KFR_MEM_INTRINSIC const T& operator[](size_t index) const CMT_NOEXCEPT { return ptr[index]; } template <typename U = T> - CMT_INLINE U* data() noexcept + KFR_MEM_INTRINSIC U* data() CMT_NOEXCEPT { return ptr_cast<U>(ptr.get()); } template <typename U = T> - CMT_INLINE const U* data() const noexcept + KFR_MEM_INTRINSIC const U* data() const CMT_NOEXCEPT { return ptr_cast<U>(ptr.get()); } - std::unique_ptr<T[], 
internal::aligned_deleter<T>> ptr; + std::unique_ptr<T[], internal_generic::aligned_deleter<T>> ptr; }; #ifdef KFR_USE_STD_ALLOCATION @@ -181,14 +188,14 @@ struct allocator { using other = allocator<U>; }; - constexpr allocator() noexcept = default; - constexpr allocator(const allocator&) noexcept = default; + constexpr allocator() CMT_NOEXCEPT = default; + constexpr allocator(const allocator&) CMT_NOEXCEPT = default; template <typename U> - constexpr allocator(const allocator<U>&) noexcept + constexpr allocator(const allocator<U>&) CMT_NOEXCEPT { } - pointer address(reference x) const noexcept { return std::addressof(x); } - const_pointer address(const_reference x) const noexcept { return std::addressof(x); } + pointer address(reference x) const CMT_NOEXCEPT { return std::addressof(x); } + const_pointer address(const_reference x) const CMT_NOEXCEPT { return std::addressof(x); } pointer allocate(size_type n, std::allocator<void>::const_pointer = 0) const { pointer result = aligned_allocate<value_type>(n); @@ -211,12 +218,12 @@ struct allocator }; template <typename T1, typename T2> -constexpr inline bool operator==(const allocator<T1>&, const allocator<T2>&) noexcept +constexpr inline bool operator==(const allocator<T1>&, const allocator<T2>&) CMT_NOEXCEPT { return true; } template <typename T1, typename T2> -constexpr inline bool operator!=(const allocator<T1>&, const allocator<T2>&) noexcept +constexpr inline bool operator!=(const allocator<T1>&, const allocator<T2>&) CMT_NOEXCEPT { return false; } @@ -243,4 +250,5 @@ public: \ private: \ mutable std::atomic_uintptr_t m_refcount = ATOMIC_VAR_INIT(0); + } // namespace kfr diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp @@ -1,107 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - 
the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/min_max.hpp" - -namespace kfr -{ -/** - * @brief Returns the smaller of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout min(const T1& x, const T2& y) -{ - return intrinsics::min(x, y); -} - -/** - * @brief Returns template expression that returns the smaller of two values. - */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::min, E1, E2> min(E1&& x, E2&& y) -{ - return { fn::min(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the greater of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout max(const T1& x, const T2& y) -{ - return intrinsics::max(x, y); -} - -/** - * @brief Returns template expression that returns the greater of two values. 
- */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::max, E1, E2> max(E1&& x, E2&& y) -{ - return { fn::max(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the smaller in magnitude of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout absmin(const T1& x, const T2& y) -{ - return intrinsics::absmin(x, y); -} - -/** - * @brief Returns template expression that returns the smaller in magnitude of two values. - */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::absmin, E1, E2> absmin(E1&& x, E2&& y) -{ - return { fn::absmin(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/** - * @brief Returns the greater in magnitude of two values. - */ -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout absmax(const T1& x, const T2& y) -{ - return intrinsics::absmax(x, y); -} - -/** - * @brief Returns template expression that returns the greater in magnitude of two values. 
- */ -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::absmax, E1, E2> absmax(E1&& x, E2&& y) -{ - return { fn::absmax(), std::forward<E1>(x), std::forward<E2>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp @@ -1,44 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/modzerobessel.hpp" - -namespace kfr -{ - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 modzerobessel(const T1& x) -{ - return intrinsics::modzerobessel(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1&& x) -{ - return { fn::modzerobessel(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp @@ -1,552 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "bitwise.hpp" -#include "function.hpp" -#include <algorithm> -#include <utility> - -namespace kfr -{ - -template <typename T> -constexpr inline T add(const T& x) -{ - return x; -} - -/** - * @brief Returns sum of all the arguments passed to a function. - */ -template <typename T1, typename T2, typename... 
Ts, KFR_ENABLE_IF(is_numeric_args<T1, T2, Ts...>::value)> -constexpr inline common_type<T1, T2, Ts...> add(const T1& x, const T2& y, const Ts&... rest) -{ - return x + add(y, rest...); -} -template <typename T> -constexpr inline T add(initialvalue<T>) -{ - return T(0); -} -KFR_FN(add) - -/** - * @brief Returns template expression that returns sum of all the arguments passed to a function. - */ -template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)> -CMT_INLINE internal::expression_function<fn::add, E...> add(E&&... x) -{ - return { fn::add(), std::forward<E>(x)... }; -} - -template <typename T1, typename T2> -constexpr inline common_type<T1, T2> sub(const T1& x, const T2& y) -{ - return x - y; -} -template <typename T> -constexpr inline T sub(initialvalue<T>) -{ - return T(0); -} -KFR_FN(sub) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::sub, E1, E2> sub(E1&& x, E2&& y) -{ - return { fn::sub(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -template <typename T1> -constexpr inline T1 mul(const T1& x) -{ - return x; -} - -/** - * @brief Returns product of all the arguments passed to a function. - */ -template <typename T1, typename T2, typename... Ts> -constexpr inline common_type<T1, T2, Ts...> mul(const T1& x, const T2& y, const Ts&... rest) -{ - return x * mul(y, rest...); -} - -template <typename T> -constexpr inline T mul(initialvalue<T>) -{ - return T(1); -} -KFR_FN(mul) - -/** - * @brief Returns template expression that returns product of all the arguments passed to a function. - */ -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::mul, E...> mul(E&&... x) -{ - return { fn::mul(), std::forward<E>(x)... }; -} - -/** - * @brief Returns square of x. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -constexpr inline T1 sqr(const T1& x) -{ - return x * x; -} -KFR_FN(sqr) - -/** - * @brief Returns template expression that returns square of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::sqr, E1> sqr(E1&& x) -{ - return { fn::sqr(), std::forward<E1>(x) }; -} - -/** - * @brief Returns cube of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -constexpr inline T1 cub(const T1& x) -{ - return sqr(x) * x; -} -KFR_FN(cub) - -/** - * @brief Returns template expression that returns cube of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::cub, E1> cub(E1&& x) -{ - return { fn::cub(), std::forward<E1>(x) }; -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow2(const T& x) -{ - return sqr(x); -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow3(const T& x) -{ - return cub(x); -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow4(const T& x) -{ - return sqr(sqr(x)); -} - -template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> -constexpr CMT_INLINE T pow5(const T& x) -{ - return pow4(x) * x; -} -KFR_FN(pow2) -KFR_FN(pow3) -KFR_FN(pow4) -KFR_FN(pow5) - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow2, E1> pow2(E1&& x) -{ - return { fn::pow2(), std::forward<E1>(x) }; -} -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow3, E1> pow3(E1&& x) -{ - return { fn::pow3(), std::forward<E1>(x) }; -} -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow4, E1> pow4(E1&& x) -{ - return { fn::pow4(), 
std::forward<E1>(x) }; -} -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::pow5, E1> pow5(E1&& x) -{ - return { fn::pow5(), std::forward<E1>(x) }; -} - -/// Raise x to the power base \f$ x^{base} \f$ -/// @code -/// CHECK( ipow( 10, 3 ) == 1000 ); -/// CHECK( ipow( 0.5, 2 ) == 0.25 ); -/// @endcode -template <typename T> -constexpr inline T ipow(const T& x, int base) -{ - T xx = x; - T result = T(1); - while (base) - { - if (base & 1) - result *= xx; - base >>= 1; - xx *= xx; - } - return result; -} -KFR_FN(ipow) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::ipow, E1, E2> ipow(E1&& x, E2&& b) -{ - return { fn::ipow(), std::forward<E1>(x), std::forward<E2>(b) }; -} - -/// Return square of the sum of all arguments -/// @code -/// CHECK(sqrsum(1,2,3) == 36); -/// @endcode -template <typename T1, typename... Ts> -constexpr inline common_type<T1, Ts...> sqrsum(const T1& x, const Ts&... 
rest) -{ - return sqr(add(x, rest...)); -} - -template <typename T1, typename T2> -constexpr inline common_type<T1, T2> sqrdiff(const T1& x, const T2& y) -{ - return sqr(x - y); -} -KFR_FN(sqrsum) -KFR_FN(sqrdiff) - -/// Division -template <typename T1, typename T2, typename Tout = common_type<T1, T2>> -CMT_INLINE Tout div(const T1& x, const T2& y) -{ - return static_cast<Tout>(x) / static_cast<Tout>(y); -} -KFR_FN(div) - -/// Remainder -template <typename T1, typename T2, typename Tout = common_type<T1, T2>> -CMT_INLINE Tout rem(const T1& x, const T2& y) -{ - return static_cast<Tout>(x) % static_cast<Tout>(y); -} -KFR_FN(rem) - -/// Negation -template <typename T1> -inline T1 neg(const T1& x) -{ - return -x; -} -KFR_FN(neg) - -/// @brief Fused Multiply-Add -template <typename T1, typename T2, typename T3> -KFR_INTRIN constexpr common_type<T1, T2, T3> fmadd(const T1& x, const T2& y, const T3& z) -{ - return x * y + z; -} -/// @brief Fused Multiply-Sub -template <typename T1, typename T2, typename T3> -KFR_INTRIN constexpr common_type<T1, T2, T3> fmsub(const T1& x, const T2& y, const T3& z) -{ - return x * y - z; -} -KFR_FN(fmadd) -KFR_FN(fmsub) - -/// @brief Linear blend of `x` and `y` (`c` must be in the range 0...+1) -/// Returns `x + ( y - x ) * c` -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_INTRIN constexpr common_type<T1, T2, T3> mix(const T1& c, const T2& x, const T3& y) -{ - return fmadd(c, y - x, x); -} - -/// @brief Linear blend of `x` and `y` (`c` must be in the range -1...+1) -template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> -KFR_INTRIN constexpr common_type<T1, T2, T3> mixs(const T1& c, const T2& x, const T3& y) -{ - return mix(fmadd(c, 0.5, 0.5), x, y); -} -KFR_FN(mix) -KFR_FN(mixs) - -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -CMT_INLINE internal::expression_function<fn::mix, 
E1, E2, E3> mix(E1&& c, E2&& x, E3&& y) -{ - return { fn::mix(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; -} - -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -CMT_INLINE internal::expression_function<fn::mixs, E1, E2, E3> mixs(E1&& c, E2&& x, E3&& y) -{ - return { fn::mixs(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; -} - -namespace internal -{ - -template <typename T1, typename T2> -constexpr CMT_INLINE common_type<T1, T2> horner(const T1&, const T2& c0) -{ - return c0; -} - -template <typename T1, typename T2, typename T3, typename... Ts> -constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner(const T1& x, const T2& c0, const T3& c1, - const Ts&... values) -{ - return fmadd(horner(x, c1, values...), x, c0); -} - -template <typename T1, typename T2> -constexpr CMT_INLINE common_type<T1, T2> horner_even(const T1&, const T2& c0) -{ - return c0; -} - -template <typename T1, typename T2, typename T3, typename... Ts> -constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner_even(const T1& x, const T2& c0, const T3& c2, - const Ts&... values) -{ - const T1 x2 = x * x; - return fmadd(horner(x2, c2, values...), x2, c0); -} - -template <typename T1, typename T2> -constexpr CMT_INLINE common_type<T1, T2> horner_odd(const T1& x, const T2& c1) -{ - return c1 * x; -} - -template <typename T1, typename T2, typename T3, typename... Ts> -constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner_odd(const T1& x, const T2& c1, const T3& c3, - const Ts&... values) -{ - const T1 x2 = x * x; - return fmadd(horner(x2, c3, values...), x2, c1) * x; -} -} // namespace internal - -/// @brief Calculate polynomial using Horner's method -/// -/// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\) -template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> -constexpr CMT_INLINE common_type<T1, Ts...> horner(const T1& x, const Ts&... 
c) -{ - return internal::horner(x, c...); -} -KFR_FN(horner) - -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::horner, E...> horner(E&&... x) -{ - return { fn::horner(), std::forward<E>(x)... }; -} - -/// @brief Calculate polynomial using Horner's method (even powers) -/// -/// ``horner_even(x, 1, 2, 3)`` is equivalent to \(3x^4 + 2x^2 + 1\) -template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> -constexpr CMT_INLINE common_type<T1, Ts...> horner_even(const T1& x, const Ts&... c) -{ - return internal::horner_even(x, c...); -} -KFR_FN(horner_even) - -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::horner_even, E...> horner_even(E&&... x) -{ - return { fn::horner_even(), std::forward<E>(x)... }; -} - -/// @brief Calculate polynomial using Horner's method (odd powers) -/// -/// ``horner_odd(x, 1, 2, 3)`` is equivalent to \(3x^5 + 2x^3 + 1x\) -template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> -constexpr CMT_INLINE common_type<T1, Ts...> horner_odd(const T1& x, const Ts&... c) -{ - return internal::horner_odd(x, c...); -} -KFR_FN(horner_odd) - -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> -CMT_INLINE internal::expression_function<fn::horner_odd, E...> horner_odd(E&&... x) -{ - return { fn::horner_odd(), std::forward<E>(x)... 
}; -} - -/// @brief Calculate Multiplicative Inverse of `x` -/// Returns `1/x` -template <typename T> -constexpr CMT_INLINE T reciprocal(const T& x) -{ - static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type"); - return subtype<T>(1) / x; -} -KFR_FN(reciprocal) - -template <typename T1, typename T2> -CMT_INLINE common_type<T1, T2> mulsign(const T1& x, const T2& y) -{ - return bitwisexor(x, bitwiseand(y, constants<T2>::highbitmask())); -} -KFR_FN(mulsign) - -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) -{ - return (x & constants<T>::highbitmask()) | (y & constants<T>::highbitmask()); -} - -/// @brief Swap byte order -template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)> -CMT_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x) -{ - return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x))); -} -template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)> -CMT_INLINE T swapbyteorder(const T& x) -{ - return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x))); -} -template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)> -CMT_INLINE T swapbyteorder(const T& x) -{ - return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x))); -} -template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)> -CMT_INLINE T swapbyteorder(const T& x) -{ - return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x))); -} -KFR_FN(swapbyteorder) - -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return blend<1, 0>(a + b, a - b); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return blend<0, 1>(a + b, a - b); -} -KFR_FN(subadd) -KFR_FN(addsub) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> negeven(const vec<T, N>& x) -{ - return x ^ broadcast<N>(-T(), T()); -} 
-template <typename T, size_t N> -CMT_INLINE vec<T, N> negodd(const vec<T, N>& x) -{ - return x ^ broadcast<N>(T(), -T()); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::neg, E1> operator-(E1&& e1) -{ - return { fn::neg(), std::forward<E1>(e1) }; -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_INLINE internal::expression_function<fn::bitwisenot, E1> operator~(E1&& e1) -{ - return { fn::bitwisenot(), std::forward<E1>(e1) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::add, E1, E2> operator+(E1&& e1, E2&& e2) -{ - return { fn::add(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::sub, E1, E2> operator-(E1&& e1, E2&& e2) -{ - return { fn::sub(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::mul, E1, E2> operator*(E1&& e1, E2&& e2) -{ - return { fn::mul(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::div, E1, E2> operator/(E1&& e1, E2&& e2) -{ - return { fn::div(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::bitwiseand, E1, E2> operator&(E1&& e1, E2&& e2) -{ - return { fn::bitwiseand(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::bitwiseor, E1, E2> operator|(E1&& e1, E2&& e2) -{ 
- return { fn::bitwiseor(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::bitwisexor, E1, E2> operator^(E1&& e1, E2&& e2) -{ - return { fn::bitwisexor(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::shl, E1, E2> operator<<(E1&& e1, E2&& e2) -{ - return { fn::shl(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::shr, E1, E2> operator>>(E1&& e1, E2&& e2) -{ - return { fn::shr(), std::forward<E1>(e1), std::forward<E2>(e2) }; -} - -template <typename T, size_t N1, size_t... Ns> -vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest) -{ - const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...)); - return compcast<vec<T, sizeof...(Ns) + 1>>(t); -} - -KFR_FN(packtranspose) -} // namespace kfr diff --git a/include/kfr/base/platform.hpp b/include/kfr/base/platform.hpp @@ -1,186 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "types.hpp" - -namespace kfr -{ - -/// @brief An enumeration representing cpu instruction set -enum class cpu_t : int -{ - common = 0, -#ifdef CMT_ARCH_X86 - sse2 = 1, - sse3 = 2, - ssse3 = 3, - sse41 = 4, - sse42 = 5, - avx1 = 6, - avx2 = 7, - avx512 = 8, // F, CD, VL, DQ and BW - avx = static_cast<int>(avx1), - lowest = static_cast<int>(sse2), - highest = static_cast<int>(avx512), -#endif -#ifdef CMT_ARCH_ARM - neon = 1, - neon64 = 2, - lowest = static_cast<int>(neon), - highest = static_cast<int>(neon64), -#endif - native = static_cast<int>(CMT_ARCH_NAME), - runtime = -1, -}; - -#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native - -template <cpu_t cpu> -using ccpu_t = cval_t<cpu_t, cpu>; - -template <cpu_t cpu> -constexpr ccpu_t<cpu> ccpu{}; - -namespace internal -{ -constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); } -constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); } - -#ifdef CMT_ARCH_X86 -constexpr auto cpu_list = cvals_t<cpu_t, cpu_t::avx512, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, - cpu_t::sse3, cpu_t::sse2>(); -#else -constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>; -#endif -} // namespace internal - -template <cpu_t cpu> -using cpuval_t = cval_t<cpu_t, cpu>; -template <cpu_t cpu> -constexpr auto cpuval = cpuval_t<cpu>{}; - -constexpr auto cpu_all = cfilter(internal::cpu_list, internal::cpu_list >= cpuval_t<cpu_t::native>()); - -/// @brief Returns name of the cpu instruction set -CMT_UNUSED static const char* cpu_name(cpu_t set) -{ -#ifdef CMT_ARCH_X86 - static const char* names[] = { "common", "sse2", "sse3", "ssse3", "sse41", - "sse42", "avx1", "avx2", 
"avx512" }; -#endif -#ifdef CMT_ARCH_ARM - static const char* names[] = { "common", "neon", "neon64" }; -#endif - if (set >= cpu_t::lowest && set <= cpu_t::highest) - return names[static_cast<size_t>(set)]; - return "-"; -} - -#ifdef CMT_ARCH_X64 -template <int = 0> -constexpr inline const char* bitness_const(const char*, const char* x64) -{ - return x64; -} -template <typename T> -constexpr inline const T& bitness_const(const T&, const T& x64) -{ - return x64; -} -#else -template <int = 0> -constexpr inline const char* bitness_const(const char* x32, const char*) -{ - return x32; -} -template <typename T> -constexpr inline const T& bitness_const(const T& x32, const T&) -{ - return x32; -} -#endif - -template <typename T = i32, cpu_t c = cpu_t::native> -struct platform -{ - constexpr static size_t native_cache_alignment = 64; - constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; - constexpr static size_t maximum_vector_alignment = 32; - constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; -#ifdef CMT_ARCH_X86 - constexpr static size_t simd_register_count = - c >= cpu_t::avx512 ? bitness_const(8, 32) : bitness_const(8, 16); -#endif -#ifdef CMT_ARCH_ARM - constexpr static size_t simd_register_count = 16; -#endif - - constexpr static size_t common_float_vector_size = 16; - constexpr static size_t common_int_vector_size = 16; - -#ifdef CMT_ARCH_X86 - constexpr static size_t native_float_vector_size = - c >= cpu_t::avx512 ? 64 : c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size; -#endif -#ifdef CMT_ARCH_ARM - constexpr static size_t native_float_vector_size = c == cpu_t::neon ? 16 : common_float_vector_size; -#endif -#ifdef CMT_ARCH_X86 - constexpr static size_t native_int_vector_size = - c >= cpu_t::avx512 ? 64 : c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size; -#endif -#ifdef CMT_ARCH_ARM - constexpr static size_t native_int_vector_size = c == cpu_t::neon ? 
16 : common_int_vector_size; -#endif - - /// @brief SIMD vector width for the given cpu instruction set - constexpr static size_t vector_width = - (const_max(size_t(1), typeclass<T> == datatype::f ? native_float_vector_size / sizeof(T) - : native_int_vector_size / sizeof(T))); - - constexpr static size_t vector_capacity = simd_register_count * vector_width; - - constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity / 4); - - constexpr static size_t native_vector_alignment = - const_max(native_float_vector_size, native_int_vector_size); - - constexpr static bool fast_unaligned = -#ifdef CMT_ARCH_X86 - c >= cpu_t::avx1; -#else - false; -#endif - - constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; -}; - -template <typename T, size_t N = platform<T>::vector_width> -struct vec; -template <typename T, size_t N = platform<T>::vector_width> -struct mask; -} // namespace kfr diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp @@ -25,14 +25,17 @@ */ #pragma once +#include "../simd/vec.hpp" #include "basic_expressions.hpp" -#include "vec.hpp" #include <memory> namespace kfr { +inline namespace CMT_ARCH_NAME +{ -constexpr size_t maximum_expression_width = platform<float>::vector_capacity / 4; +template <typename T> +constexpr size_t maximum_expression_width = vector_width<T> * 2; template <typename T, bool enable_resource = true> struct expression_pointer; @@ -41,11 +44,11 @@ namespace internal { template <typename Expression, typename T, size_t key = 0> -KFR_SINTRIN bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = {}); +KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, + csize_t<key> = {}); } -template <typename T, size_t N = maximum_expression_width> +template <typename T, size_t N = maximum_expression_width<T>> struct expression_vtable : expression_vtable<T, N / 2> { using func_get = void 
(*)(void*, size_t, vec<T, N>&); @@ -60,7 +63,7 @@ struct expression_vtable : expression_vtable<T, N / 2> template <typename Expression> static void static_get(void* instance, size_t index, vec<T, N>& result) { - result = static_cast<Expression*>(instance)->operator()(cinput, index, vec_t<T, N>()); + result = get_elements(*static_cast<Expression*>(instance), cinput, index, vec_shape<T, N>()); } }; @@ -78,7 +81,7 @@ struct expression_vtable<T, 0> func_substitute substitute; template <typename Expression> - expression_vtable(ctype_t<Expression> t) + expression_vtable(ctype_t<Expression>) : size(&expression_vtable<T, 0>::template static_size<Expression>), begin_block(&expression_vtable<T, 0>::template static_begin_block<Expression>), end_block(&expression_vtable<T, 0>::template static_end_block<Expression>), @@ -117,7 +120,7 @@ struct expression_resource template <typename E> struct expression_resource_impl : expression_resource { - expression_resource_impl(E&& e) noexcept : e(std::move(e)) {} + expression_resource_impl(E&& e) CMT_NOEXCEPT : e(std::move(e)) {} virtual ~expression_resource_impl() {} virtual void* instance() override final { return &e; } @@ -126,7 +129,7 @@ private: }; template <typename E> -KFR_SINTRIN std::shared_ptr<expression_resource> make_resource(E&& e) +KFR_INTRINSIC std::shared_ptr<expression_resource> make_resource(E&& e) { using T = expression_resource_impl<decay<E>>; return std::static_pointer_cast<expression_resource>( @@ -138,31 +141,35 @@ struct expression_pointer : input_expression { using value_type = T; - expression_pointer() noexcept : instance(nullptr), vtable(nullptr) {} + expression_pointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {} expression_pointer(void* instance, const expression_vtable<T>* vtable, std::shared_ptr<expression_resource> resource = nullptr) : instance(instance), vtable(vtable), resource(std::move(resource)) { } - template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width)> - CMT_INLINE vec<T, N> 
operator()(cinput_t, size_t index, vec_t<T, N>) const + template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width<T>)> + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t, size_t index, + vec_shape<T, N>) { static_assert(is_poweroftwo(N), "N must be a power of two"); vec<T, N> result; - static_cast<const expression_vtable<T, N>*>(vtable)->get(instance, index, result); + static_cast<const expression_vtable<T, N>*>(self.vtable)->get(self.instance, index, result); return result; } - template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width)> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width<T>)> + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t cinput, + size_t index, vec_shape<T, N>) { - return concat(operator()(cinput, index, vec_t<T, N / 2>()), operator()(cinput, index + N / 2, - vec_t<T, N / 2>())); + static_assert(is_poweroftwo(N), "N must be a power of two"); + const vec<T, N / 2> r1 = get_elements(self, cinput, index, vec_shape<T, N / 2>()); + const vec<T, N / 2> r2 = get_elements(self, cinput, index + N / 2, vec_shape<T, N / 2>()); + return concat(r1, r2); } - CMT_INLINE void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); } - CMT_INLINE void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); } - CMT_INLINE size_t size() const { return vtable->size(instance); } + KFR_MEM_INTRINSIC void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); } + KFR_MEM_INTRINSIC void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); } + KFR_MEM_INTRINSIC size_t size() const { return vtable->size(instance); } - CMT_INLINE bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const + KFR_MEM_INTRINSIC bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = 
csize_t<0>{}) const { return vtable->substitute(instance, std::move(new_pointer)); } @@ -179,7 +186,7 @@ namespace internal { template <typename T, typename E> -CMT_INLINE expression_vtable<T>* make_expression_vtable() +KFR_INTRINSIC expression_vtable<T>* make_expression_vtable() { static_assert(is_input_expression<E>::value, "E must be an expression"); static expression_vtable<T> vtable{ ctype_t<decay<E>>{} }; @@ -192,7 +199,7 @@ CMT_INLINE expression_vtable<T>* make_expression_vtable() * @warning Use with caution with local variables. */ template <typename E, typename T = value_type_of<E>> -CMT_INLINE expression_pointer<T> to_pointer(E& expr) +KFR_INTRINSIC expression_pointer<T> to_pointer(E& expr) { static_assert(is_input_expression<E>::value, "E must be an expression"); return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, E>()); @@ -203,7 +210,7 @@ CMT_INLINE expression_pointer<T> to_pointer(E& expr) * @note Use std::move to force use of this overload. */ template <typename E, typename T = value_type_of<E>> -CMT_INLINE expression_pointer<T> to_pointer(E&& expr) +KFR_INTRINSIC expression_pointer<T> to_pointer(E&& expr) { static_assert(is_input_expression<E>::value, "E must be an expression"); std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr)); @@ -215,24 +222,25 @@ template <typename T, size_t key> class expression_placeholder : public input_expression { public: - using value_type = T; - expression_placeholder() noexcept = default; + using value_type = T; + expression_placeholder() CMT_NOEXCEPT = default; template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const + friend KFR_INTRINSIC vec<U, N> get_elements(const expression_placeholder& self, cinput_t, + size_t index, vec_shape<U, N>) { - return pointer ? cast<U>(pointer(cinput, index, vec_t<T, N>())) : 0; + return self.pointer ? 
elemcast<U>(get_elements(self.pointer, cinput, index, vec_shape<T, N>())) : 0; } expression_pointer<T> pointer; }; template <typename T, size_t key = 0> -KFR_SINTRIN expression_placeholder<T, key> placeholder(csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC expression_placeholder<T, key> placeholder(csize_t<key> = csize_t<key>{}) { return expression_placeholder<T, key>(); } template <typename... Args> -KFR_SINTRIN bool substitute(input_expression&, Args&&...) +KFR_INTRINSIC bool substitute(input_expression&, Args&&...) { return false; } @@ -240,28 +248,28 @@ KFR_SINTRIN bool substitute(input_expression&, Args&&...) namespace internal { template <typename... Args, typename T, size_t key, size_t... indices> -KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer, - csize_t<key>, csizes_t<indices...>); +KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr, + expression_pointer<T>&& new_pointer, csize_t<key>, csizes_t<indices...>); } template <typename T, size_t key = 0> -KFR_SINTRIN bool substitute(expression_placeholder<T, key>& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC bool substitute(expression_placeholder<T, key>& expr, expression_pointer<T>&& new_pointer, + csize_t<key> = csize_t<key>{}) { expr.pointer = std::move(new_pointer); return true; } template <typename... 
Args, typename T, size_t key = 0> -KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr, + expression_pointer<T>&& new_pointer, csize_t<key> = csize_t<key>{}) { return internal::substitute(expr, std::move(new_pointer), csize_t<key>{}, indicesfor_t<Args...>{}); } template <typename T, size_t key = 0> -KFR_SINTRIN bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& new_pointer, - csize_t<key> = csize_t<key>{}) +KFR_INTRINSIC bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& new_pointer, + csize_t<key> = csize_t<key>{}) { return expr.substitute(std::move(new_pointer), csize_t<key>{}); } @@ -269,17 +277,17 @@ KFR_SINTRIN bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& namespace internal { -KFR_SINTRIN bool var_or() { return false; } +KFR_INTRINSIC bool var_or() { return false; } template <typename... Args> -KFR_SINTRIN bool var_or(bool b, Args... args) +KFR_INTRINSIC bool var_or(bool b, Args... args) { return b || var_or(args...); } template <typename... Args, typename T, size_t key, size_t... 
indices> -KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer, - csize_t<key>, csizes_t<indices...>) +KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr, + expression_pointer<T>&& new_pointer, csize_t<key>, csizes_t<indices...>) { return var_or(substitute(std::get<indices>(expr.args), std::move(new_pointer), csize_t<key>())...); } @@ -290,10 +298,11 @@ namespace internal { template <typename Expression, typename T, size_t key> -KFR_SINTRIN bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, csize_t<key>) +KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, csize_t<key>) { return kfr::substitute(expr, std::move(new_pointer), csize_t<key>{}); } } // namespace internal +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp @@ -1,4 +1,4 @@ -/** @addtogroup math +/** @addtogroup random * @{ */ /* @@ -24,55 +24,58 @@ See https://www.kfrlib.com for details. 
*/ #pragma once -#include "function.hpp" -#include "operators.hpp" -#include "shuffle.hpp" -#include "vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/operators.hpp" +#include "../simd/shuffle.hpp" +#include "../simd/vec.hpp" namespace kfr { -using random_state = u32x4; - #ifndef KFR_DISABLE_READCYCLECOUNTER - struct seed_from_rdtsc_t { }; constexpr seed_from_rdtsc_t seed_from_rdtsc{}; +#endif + +inline namespace CMT_ARCH_NAME +{ + +using random_state = u32x4; -#ifndef KFR_READCYCLECOUNTER +#ifndef KFR_DISABLE_READCYCLECOUNTER #ifdef CMT_COMPILER_CLANG -#define KFR_READCYCLECOUNTER() __builtin_readcyclecounter() +#define KFR_builtin_readcyclecounter() \ + static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here #else -#define KFR_READCYCLECOUNTER() __rdtsc() +#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc()) #endif #endif -#endif - struct random_bit_generator { - #ifndef KFR_DISABLE_READCYCLECOUNTER - random_bit_generator(seed_from_rdtsc_t) noexcept - : state(bitcast<u32>( - make_vector(KFR_READCYCLECOUNTER(), (KFR_READCYCLECOUNTER() << 11) ^ 0x710686d615e2257bull))) + KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT + : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(), + (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull))) { (void)operator()(); } #endif - random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3) + KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT + : state(x0, x1, x2, x3) { (void)operator()(); } - random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1))) + KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT + : state(bitcast<u32>(make_vector(x0, x1))) { (void)operator()(); } - inline random_state operator()() + KFR_MEM_INTRINSIC random_state operator()() { const static random_state mul{ 214013u, 17405u, 214013u, 69069u }; const static 
random_state add{ 2531011u, 10395331u, 13737667u, 1u }; @@ -87,13 +90,13 @@ protected: static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16"); template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))> -inline vec<u8, N> random_bits(random_bit_generator& gen) +KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen) { return narrow<N>(bitcast<u8>(gen())); } template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))> -inline vec<u8, N> random_bits(random_bit_generator& gen) +KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen) { constexpr size_t N2 = prev_poweroftwo(N - 1); const vec<u8, N2> bits1 = random_bits<N2>(gen); @@ -102,37 +105,37 @@ inline vec<u8, N> random_bits(random_bit_generator& gen) } template <typename T, size_t N, KFR_ENABLE_IF(std::is_integral<T>::value)> -inline vec<T, N> random_uniform(random_bit_generator& gen) +KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen) { return bitcast<T>(random_bits<N * sizeof(T)>(gen)); } template <typename T, size_t N, KFR_ENABLE_IF(std::is_same<T, f32>::value)> -inline vec<f32, N> randommantissa(random_bit_generator& gen) +KFR_INTRINSIC vec<f32, N> randommantissa(random_bit_generator& gen) { return bitcast<f32>((random_uniform<u32, N>(gen) & 0x7FFFFFu) | 0x3f800000u) + 0.0f; } template <typename T, size_t N, KFR_ENABLE_IF(std::is_same<T, f64>::value)> -inline vec<f64, N> randommantissa(random_bit_generator& gen) +KFR_INTRINSIC vec<f64, N> randommantissa(random_bit_generator& gen) { return bitcast<f64>((random_uniform<u64, N>(gen) & 0x000FFFFFFFFFFFFFull) | 0x3FF0000000000000ull) + 0.0; } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -inline vec<T, N> random_uniform(random_bit_generator& gen) +KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen) { return randommantissa<T, N>(gen) - 1.f; } template <size_t N, typename T, KFR_ENABLE_IF(is_f_class<T>::value)> -inline vec<T, N> random_range(random_bit_generator& gen, T min, T max) 
+KFR_INTRINSIC vec<T, N> random_range(random_bit_generator& gen, T min, T max) { return mix(random_uniform<T, N>(gen), min, max); } template <size_t N, typename T, KFR_ENABLE_IF(!is_f_class<T>::value)> -inline vec<T, N> random_range(random_bit_generator& gen, T min, T max) +KFR_INTRINSIC vec<T, N> random_range(random_bit_generator& gen, T min, T max) { using big_type = findinttype<sqr(std::numeric_limits<T>::min()), sqr(std::numeric_limits<T>::max())>; @@ -147,11 +150,11 @@ template <typename T> struct expression_random_uniform : input_expression { using value_type = T; - constexpr expression_random_uniform(const random_bit_generator& gen) noexcept : gen(gen) {} + constexpr expression_random_uniform(const random_bit_generator& gen) CMT_NOEXCEPT : gen(gen) {} template <size_t N> - vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + friend vec<T, N> get_elements(const expression_random_uniform& self, cinput_t, size_t, vec_shape<T, N>) { - return random_uniform<T, N>(gen); + return random_uniform<T, N>(self.gen); } mutable random_bit_generator gen; }; @@ -160,15 +163,16 @@ template <typename T> struct expression_random_range : input_expression { using value_type = T; - constexpr expression_random_range(const random_bit_generator& gen, T min, T max) noexcept - : gen(gen), min(min), max(max) + constexpr expression_random_range(const random_bit_generator& gen, T min, T max) CMT_NOEXCEPT : gen(gen), + min(min), + max(max) { } template <size_t N> - vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const + friend vec<T, N> get_elements(const expression_random_range& self, cinput_t, size_t, vec_shape<T, N>) { - return random_range<N, T>(gen, min, max); + return random_range<N, T>(self.gen, self.min, self.max); } mutable random_bit_generator gen; const T min; @@ -178,16 +182,15 @@ struct expression_random_range : input_expression /// @brief Returns expression that returns pseudo random values template <typename T> -inline internal::expression_random_uniform<T> 
gen_random_uniform(const random_bit_generator& gen) +KFR_FUNCTION internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen) { return internal::expression_random_uniform<T>(gen); } - #ifndef KFR_DISABLE_READCYCLECOUNTER /// @brief Returns expression that returns pseudo random values template <typename T> -inline internal::expression_random_uniform<T> gen_random_uniform() +KFR_FUNCTION internal::expression_random_uniform<T> gen_random_uniform() { return internal::expression_random_uniform<T>(random_bit_generator(seed_from_rdtsc)); } @@ -195,18 +198,19 @@ inline internal::expression_random_uniform<T> gen_random_uniform() /// @brief Returns expression that returns pseudo random values of the given range template <typename T> -inline internal::expression_random_range<T> gen_random_range(const random_bit_generator& gen, T min, T max) +KFR_FUNCTION internal::expression_random_range<T> gen_random_range(const random_bit_generator& gen, T min, + T max) { return internal::expression_random_range<T>(gen, min, max); } - #ifndef KFR_DISABLE_READCYCLECOUNTER /// @brief Returns expression that returns pseudo random values of the given range template <typename T> -inline internal::expression_random_range<T> gen_random_range(T min, T max) +KFR_FUNCTION internal::expression_random_range<T> gen_random_range(T min, T max) { return internal::expression_random_range<T>(random_bit_generator(seed_from_rdtsc), min, max); } #endif +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp @@ -1,239 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "shuffle.hpp" -#include "types.hpp" -#include "vec.hpp" - -namespace kfr -{ - -template <size_t N, bool A = false, typename T> -CMT_INLINE static vec<T, N> read(const T* src) -{ - return vec<T, N>(src, cbool_t<A>()); -} - -template <bool A = false, size_t N, typename T> -CMT_INLINE static void write(T* dest, const vec<T, N>& value) -{ - value.write(dest, cbool_t<A>()); -} - -template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> -CMT_INLINE vec<T, Nout> gather(const T* base, size_t index, Indices... indices) -{ - return make_vector(base[index], base[indices]...); -} - -template <size_t Index, size_t... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> -CMT_INLINE vec<T, Nout> gather(const T* base) -{ - return make_vector(base[Index], base[Indices]...); -} - -template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0> -CMT_INLINE void scatter(const T* base, const vec<T, N>& value) -{ - base[Index] = value[InIndex]; - scatter<Indices..., T, N, InIndex + 1>(base, value); -} - -namespace internal -{ -template <typename T, size_t N, size_t... Indices> -CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) -{ - return make_vector(base[indices[Indices]]...); -} -template <size_t Nout, size_t Stride, typename T, size_t... 
Indices> -CMT_INLINE vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>) -{ - return make_vector(base[Indices * Stride]...); -} -template <size_t Nout, size_t groupsize, typename T, size_t... Indices> -CMT_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>) -{ - return make_vector(read<groupsize>(base + Indices * groupsize * stride)...); -} -} // namespace internal - -template <typename T, size_t N> -CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices) -{ - return internal::gather(base, indices, csizeseq_t<N>()); -} - -template <size_t Nout, size_t groupsize = 1, typename T> -CMT_INLINE vec<T, Nout * groupsize> gather_stride(const T* base, size_t stride) -{ - return internal::gather_stride_s<Nout, groupsize>(base, stride, csizeseq_t<Nout>()); -} - -template <size_t Nout, size_t Stride, typename T> -CMT_INLINE vec<T, Nout> gather_stride(const T* base) -{ - return internal::gather_stride<Nout, Stride>(base, csizeseq_t<Nout>()); -} - -template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices> -CMT_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>) -{ - return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...); -} -template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) -{ - return gather_helper<groupsize>(base, offset, csizeseq_t<N>()); -} - -template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> -CMT_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, - csizes_t<Indices...>) -{ - swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)), - 0)... }; -} -template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, size_t... 
Indices> -CMT_INLINE void scatter_helper_s(T* base, size_t stride, const vec<T, Nout>& value, csizes_t<Indices...>) -{ - swallow{ (write(base + groupsize * stride, slice<Indices * groupsize, groupsize>(value)), 0)... }; -} -template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> -CMT_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) -{ - return scatter_helper<groupsize>(base, offset, value, csizeseq_t<N>()); -} - -template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> -CMT_INLINE void scatter_stride(T* base, const vec<T, Nout>& value, size_t stride) -{ - return scatter_helper_s<groupsize>(base, stride, value, csizeseq_t<N>()); -} - -template <typename T, size_t groupsize = 1> -struct stride_pointer : public stride_pointer<const T, groupsize> -{ - template <size_t N> - void write(const vec<T, N>& val, csize_t<N> = csize_t<N>()) - { - kfr::scatter_stride<N, groupsize>(this->ptr, val); - } -}; - -template <typename T, size_t groupsize> -struct stride_pointer<const T, groupsize> -{ - const T* ptr; - const size_t stride; - - template <size_t N> - vec<T, N> read(csize_t<N> = csize_t<N>()) - { - return kfr::gather_stride<N, groupsize>(ptr, stride); - } -}; - -template <typename T> -constexpr T partial_masks[] = { constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - 
constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - constants<T>::allones(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T(), - T() }; - -template <typename T, size_t N> -CMT_INLINE vec<T, N> partial_mask(size_t index) -{ - static_assert(N <= arraysize(partial_masks<T>) / 2, - "N must not be greater than half of partial_masks expression_array"); - return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index); -} -template <typename T, size_t N> -CMT_INLINE vec<T, N> partial_mask(size_t index, vec_t<T, N>) -{ - return partial_mask<T, N>(index); -} -} // namespace kfr diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp @@ -1,4 +1,4 @@ -/** @addtogroup expressions +/** @addtogroup array * @{ */ /* @@ -25,39 +25,41 @@ */ #pragma once +#include "../math/min_max.hpp" +#include "../simd/horizontal.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" #include "basic_expressions.hpp" -#include "function.hpp" -#include "horizontal.hpp" -#include "min_max.hpp" -#include "operators.hpp" -#include "vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T> -CMT_INLINE T final_mean(T value, size_t size) +KFR_INTRINSIC T final_mean(T value, size_t size) { return value / T(size); } KFR_FN(final_mean) template <typename T> -CMT_INLINE T final_rootmean(T value, size_t size) +KFR_INTRINSIC T final_rootmean(T value, size_t size) { - return internal::builtin_sqrt(value / T(size)); + return builtin_sqrt(value / T(size)); } KFR_FN(final_rootmean) namespace internal { template <typename FinalFn, typename T, KFR_ENABLE_IF(is_callable<FinalFn, T, size_t>::value)> -CMT_INLINE auto 
reduce_call_final(FinalFn&& finalfn, size_t size, T value) +KFR_INTRINSIC auto reduce_call_final(FinalFn&& finalfn, size_t size, T value) { return finalfn(value, size); } template <typename FinalFn, typename T, KFR_ENABLE_IF(!is_callable<FinalFn, T, size_t>::value)> -CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) +KFR_INTRINSIC auto reduce_call_final(FinalFn&& finalfn, size_t, T value) { return finalfn(value); } @@ -65,7 +67,7 @@ CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, KFR_ARCH_DEP> struct expression_reduce : output_expression { - constexpr static size_t width = platform<T>::vector_width * bitness_const(1, 2); + constexpr static size_t width = vector_width<T> * bitness_const(1, 2); using value_type = T; @@ -76,26 +78,29 @@ struct expression_reduce : output_expression } template <size_t N> - CMT_INLINE void operator()(coutput_t, size_t, const vec<T, N>& x) const + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<T, N>& x) const { counter += N; process(x); } - CMT_INLINE T get() { return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); } + KFR_MEM_INTRINSIC T get() + { + return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); + } protected: void reset() { counter = 0; } - CMT_INLINE void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); } + KFR_MEM_INTRINSIC void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); } template <size_t N, KFR_ENABLE_IF(N < width)> - CMT_INLINE void process(const vec<T, N>& x) const + KFR_MEM_INTRINSIC void process(const vec<T, N>& x) const { value = combine(value, reducefn(transformfn(x), narrow<N>(value))); } template <size_t N, KFR_ENABLE_IF(N > width)> - CMT_INLINE void process(const vec<T, N>& x) const + KFR_MEM_INTRINSIC void process(const vec<T, N>& x) const { 
process(low(x)); process(high(x)); @@ -109,10 +114,11 @@ protected: }; } // namespace internal -template <typename ReduceFn, typename TransformFn = fn::pass_through, typename FinalFn = fn::pass_through, - typename E1, typename T = value_type_of<E1>> -KFR_SINTRIN T reduce(const E1& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(), - FinalFn&& finalfn = fn::pass_through()) +template <typename ReduceFn, typename TransformFn = fn_generic::pass_through, + typename FinalFn = fn_generic::pass_through, typename E1, typename T = value_type_of<E1>> +KFR_INTRINSIC T reduce(const E1& e1, ReduceFn&& reducefn, + TransformFn&& transformfn = fn_generic::pass_through(), + FinalFn&& finalfn = fn_generic::pass_through()) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); using reducer_t = internal::expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>; @@ -134,7 +140,7 @@ KFR_FN(reduce) * \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T sum(const E1& x) +KFR_INTRINSIC T sum(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::add()); @@ -149,10 +155,10 @@ KFR_SINTRIN T sum(const E1& x) * \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T mean(const E1& x) +KFR_INTRINSIC T mean(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(x, fn::add(), fn::pass_through(), fn::final_mean()); + return reduce(x, fn::add(), fn_generic::pass_through(), fn::final_mean()); } /** @@ -161,7 +167,7 @@ KFR_SINTRIN T mean(const E1& x) * x must have its size and type specified. 
*/ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T minof(const E1& x) +KFR_INTRINSIC T minof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::min()); @@ -173,7 +179,7 @@ KFR_SINTRIN T minof(const E1& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T maxof(const E1& x) +KFR_INTRINSIC T maxof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::max()); @@ -185,7 +191,7 @@ KFR_SINTRIN T maxof(const E1& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T absminof(const E1& x) +KFR_INTRINSIC T absminof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::absmin()); @@ -197,7 +203,7 @@ KFR_SINTRIN T absminof(const E1& x) * x must have its size and type specified. 
*/ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T absmaxof(const E1& x) +KFR_INTRINSIC T absmaxof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::absmax()); @@ -214,7 +220,7 @@ KFR_SINTRIN T absmaxof(const E1& x) template <typename E1, typename E2, typename T = value_type_of<decltype(std::declval<E1>() * std::declval<E2>())>, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_SINTRIN T dotproduct(E1&& x, E2&& y) +KFR_INTRINSIC T dotproduct(E1&& x, E2&& y) { auto m = std::forward<E1>(x) * std::forward<E2>(y); using E12 = decltype(m); @@ -231,7 +237,7 @@ KFR_SINTRIN T dotproduct(E1&& x, E2&& y) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T rms(const E1& x) +KFR_INTRINSIC T rms(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::add(), fn::sqr(), fn::final_rootmean()); @@ -246,7 +252,7 @@ KFR_SINTRIN T rms(const E1& x) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T sumsqr(const E1& x) +KFR_INTRINSIC T sumsqr(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::add(), fn::sqr()); @@ -261,9 +267,11 @@ KFR_SINTRIN T sumsqr(const E1& x) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T product(const E1& x) +KFR_INTRINSIC T product(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); return reduce(x, fn::mul()); } + +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp @@ -1,158 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D 
Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/round.hpp" - -namespace kfr -{ - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 floor(const T1& x) -{ - return intrinsics::floor(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::floor, E1> floor(E1&& x) -{ - return { fn::floor(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 ceil(const T1& x) -{ - return intrinsics::ceil(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::ceil, E1> ceil(E1&& x) -{ - return { fn::ceil(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 round(const T1& x) -{ - return intrinsics::round(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::round, E1> round(E1&& x) -{ - return { fn::round(), std::forward<E1>(x) }; -} - -template <typename T1, 
KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 trunc(const T1& x) -{ - return intrinsics::trunc(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::trunc, E1> trunc(E1&& x) -{ - return { fn::trunc(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 fract(const T1& x) -{ - return intrinsics::fract(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::fract, E1> fract(E1&& x) -{ - return { fn::fract(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> ifloor(const T1& x) -{ - return intrinsics::ifloor(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::ifloor, E1> ifloor(E1&& x) -{ - return { fn::ifloor(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> iceil(const T1& x) -{ - return intrinsics::iceil(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::iceil, E1> iceil(E1&& x) -{ - return { fn::iceil(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> iround(const T1& x) -{ - return intrinsics::iround(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::iround, E1> iround(E1&& x) -{ - return { fn::iround(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN itype<T1> itrunc(const T1& x) -{ - return intrinsics::itrunc(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::itrunc, E1> itrunc(E1&& x) -{ - return { fn::itrunc(), 
std::forward<E1>(x) }; -} - -template <typename T, KFR_ENABLE_IF(is_f_class<T>::value)> -CMT_INLINE T fmod(const T& x, const T& y) -{ - return x - trunc(x / y) * y; -} -KFR_FN(fmod) - -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -constexpr CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) -{ - return x % y; -} -template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) -{ - return fmod(x, y); -} -} // namespace kfr diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp @@ -1,62 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/saturation.hpp" - -namespace kfr -{ - -/// @brief Adds two arguments using saturation -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout satadd(const T1& x, const T2& y) -{ - return intrinsics::satadd(x, y); -} - -/// @brief Creates an expression that adds two arguments using saturation -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::satadd, E1, E2> satadd(E1&& x, E2&& y) -{ - return { fn::satadd(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -/// @brief Subtracts two arguments using saturation -template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), - typename Tout = common_type<T1, T2>> -KFR_INTRIN Tout satsub(const T1& x, const T2& y) -{ - return intrinsics::satsub(x, y); -} - -/// @brief Creates an expression that subtracts two arguments using saturation -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -KFR_INTRIN internal::expression_function<fn::satsub, E1, E2> satsub(E1&& x, E2&& y) -{ - return { fn::satsub(), std::forward<E1>(x), std::forward<E2>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -1,57 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/select.hpp" - -namespace kfr -{ - -/** - * @brief Returns x if m is true, otherwise return y. Order of the arguments is same as in ternary operator. - * @code - * return m ? x : y - * @endcode - */ -template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), - typename Tout = subtype<common_type<T2, T3>>> -KFR_INTRIN vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y) -{ - static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); - return intrinsics::select(bitcast<Tout>(m.asvec()).asmask(), static_cast<vec<Tout, N>>(x), - static_cast<vec<Tout, N>>(y)); -} - -/** - * @brief Returns template expression that returns x if m is true, otherwise return y. Order of the arguments - * is same as in ternary operator. - */ -template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> -KFR_INTRIN internal::expression_function<fn::select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y) -{ - return { fn::select(), std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) }; -} -} // namespace kfr diff --git a/include/kfr/base/shuffle.hpp b/include/kfr/base/shuffle.hpp @@ -1,625 +0,0 @@ -/** @addtogroup shuffle - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once -#include "constants.hpp" -#include "expression.hpp" -#include "types.hpp" -#include "vec.hpp" - -#include <utility> - -namespace kfr -{ - -namespace internal -{ - -template <typename T, typename... Ts, size_t... indices, size_t Nin = sizeof...(Ts), - size_t Nout = sizeof...(indices)> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast_helper(csizes_t<indices...>, const Ts&... values) -{ - const std::tuple<Ts...> tup(values...); - return vec<T, Nout>(std::get<indices % Nin>(tup)...); -} -} // namespace internal - -template <size_t Nout, typename... Ts, typename C = typename std::common_type<Ts...>::type> -CMT_GNU_CONSTEXPR CMT_INLINE vec<C, Nout> broadcast(const Ts&... 
values) -{ - return internal::broadcast_helper<C>(csizeseq_t<Nout>(), values...); -} -KFR_FN(broadcast) - -template <size_t Ncount, typename T, size_t N> -CMT_INLINE vec<T, N + Ncount> padhigh(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N + Ncount>()); -} -KFR_FN(padhigh) - -template <size_t Ncount, typename T, size_t N> -CMT_INLINE vec<T, N + Ncount> padlow(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N + Ncount, 0 - Ncount>()); -} -KFR_FN(padlow) - -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)> -CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>()); -} -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)> -constexpr CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x) -{ - return x; -} -KFR_FN(extend) - -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, count> slice(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<count, start>()); -} -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<count, start>()); -} -KFR_FN(slice) - -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, N> replace(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle( - y, csizeseq_t<N>() + - (csizeseq_t<N>() >= csize_t<start>() && csizeseq_t<N>() < csize_t<start + count>()) * N); -} -KFR_FN(replace) - -template <size_t, typename T, size_t N> -CMT_INLINE void split(const vec<T, N>&) -{ -} -template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args> -CMT_INLINE void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... 
args) -{ - out = x.shuffle(csizeseq_t<Nout, start>()); - split<start + Nout>(x, std::forward<Args>(args)...); -} -template <typename T, size_t N> -CMT_INLINE void split(const vec<T, N>& x, vec<T, N / 2>& low, vec<T, N / 2>& high) -{ - low = x.shuffle(csizeseq_t<N / 2, 0>()); - high = x.shuffle(csizeseq_t<N / 2, N / 2>()); -} -template <typename T, size_t N> -CMT_INLINE void split(const vec<T, N>& x, vec<T, N / 4>& w0, vec<T, N / 4>& w1, vec<T, N / 4>& w2, - vec<T, N / 4>& w3) -{ - w0 = x.shuffle(csizeseq_t<N / 4, 0>()); - w1 = x.shuffle(csizeseq_t<N / 4, N / 4>()); - w2 = x.shuffle(csizeseq_t<N / 4, 2 * N / 4>()); - w3 = x.shuffle(csizeseq_t<N / 4, 3 * N / 4>()); -} -KFR_FN(split) - -template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total> -CMT_INLINE vec<T, Nout> part(const vec<T, N>& x) -{ - static_assert(N % total == 0, "N % total == 0"); - return x.shuffle(csizeseq_t<Nout, number * Nout>()); -} -KFR_FN(part) - -template <size_t start, size_t count, typename T, size_t N> -CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<count, start>()); -} - -template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> -CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<count, start>()); -} - -template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> -CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(csizeseq_t<N2, -(N2 - N1)>()) - .shuffle(y, csizeseq_t<N2 * 2>()) - .shuffle(csizeseq_t<count, N2 - N1 + start>()); -} - -KFR_FN(concat_and_slice) - -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout > N)> -CMT_INLINE vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T()) -{ - static_assert(Nout > N, "Nout > 
N"); - return concat(x, broadcast<Nout - N>(newvalue)); -} -template <size_t Nout, typename T, typename TS> -constexpr CMT_INLINE vec<T, Nout> widen(const vec<T, Nout>& x, TS) -{ - return x; -} -KFR_FN(widen) - -template <size_t Nout, typename T, size_t N> -CMT_INLINE vec<T, Nout> narrow(const vec<T, N>& x) -{ - static_assert(Nout <= N, "Nout <= N"); - return slice<0, Nout>(x); -} -KFR_FN(narrow) - -template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> -CMT_INLINE vec<T, Nout> even(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<Nout / group, 0, 2>())); -} -KFR_FN(even) - -template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> -CMT_INLINE vec<T, Nout> odd(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<Nout / group, 1, 2>())); -} -KFR_FN(odd) - -namespace internal -{ -template <size_t groupsize = 2> -struct shuffle_index_dup1 -{ - constexpr inline size_t operator()(size_t index) const { return index / groupsize; } -}; - -template <size_t groupsize = 2, size_t start = 0> -struct shuffle_index_dup -{ - constexpr inline size_t operator()(size_t index) const { return start + index / groupsize * groupsize; } -}; -} // namespace internal - -template <typename T, size_t N> -CMT_INLINE vec<T, N> dupeven(const vec<T, N>& x) -{ - static_assert(N % 2 == 0, "N must be even"); - return x.shuffle(csizeseq_t<N, 0, 1>() & ~csize_t<1>()); -} -KFR_FN(dupeven) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> dupodd(const vec<T, N>& x) -{ - static_assert(N % 2 == 0, "N must be even"); - return x.shuffle(csizeseq_t<N, 0, 1>() | csize_t<1>()); -} -KFR_FN(dupodd) - -template <typename T, size_t N> -CMT_INLINE vec<T, N * 2> duphalfs(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N * 2>() % csize_t<N>()); -} -KFR_FN(duphalfs) - -namespace internal -{ -template <size_t size, size_t... 
Indices> -struct shuffle_index_shuffle -{ - constexpr static size_t indexcount = sizeof...(Indices); - - template <size_t index> - constexpr inline size_t operator()(csize_t<index>) const - { - return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount; - } -}; -} // namespace internal - -template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, - elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] + - csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); -} -KFR_FN(shuffle) - -template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y, - elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(y, scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + - csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); -} -KFR_FN(shufflegroups) - -namespace internal -{ -template <size_t size, size_t... Indices> -struct shuffle_index_permute -{ - constexpr static size_t indexcount = sizeof...(Indices); - - template <size_t index> - constexpr inline size_t operator()(csize_t<index>) const - { - return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount; - } -}; -} // namespace internal - -template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(i[csizeseq_t<N>() % csize_t<count>()] + - csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); -} -KFR_FN(permute) - -template <size_t group, size_t... 
Indices, typename T, size_t N, size_t count = sizeof...(Indices)> -CMT_INLINE vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + - csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); -} -KFR_FN(permutegroups) - -namespace internal -{ - -template <typename T, size_t Nout, typename Fn, size_t... Indices> -constexpr CMT_INLINE vec<T, Nout> generate_vector(csizes_t<Indices...>) -{ - return make_vector(static_cast<T>(Fn()(Indices))...); -} -} // namespace internal - -template <typename T, size_t Nout, typename Fn> -constexpr CMT_INLINE vec<T, Nout> generate_vector() -{ - return internal::generate_vector<T, Nout, Fn>(cvalseq_t<size_t, Nout>()); -} -KFR_FN(generate_vector) - -namespace internal -{ -template <typename T, size_t N> -constexpr CMT_INLINE mask<T, N> evenmask() -{ - return broadcast<N, T>(maskbits<T>(true), maskbits<T>(false)); -} -template <typename T, size_t N> -constexpr CMT_INLINE mask<T, N> oddmask() -{ - return broadcast<N, T>(maskbits<T>(false), maskbits<T>(true)); -} -} // namespace internal - -template <typename T, size_t N, size_t Nout = N * 2> -CMT_INLINE vec<T, Nout> dup(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>() / csize_t<2>()); -} -KFR_FN(dup) - -namespace internal -{ -template <size_t count, size_t start = 0> -struct shuffle_index_duphalf -{ - constexpr inline size_t operator()(size_t index) const { return start + (index) % count; } -}; -} // namespace internal - -template <typename T, size_t N> -CMT_INLINE vec<T, N> duplow(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>()); -} -KFR_FN(duplow) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> duphigh(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>() + csize_t<N - N / 2>()); -} -KFR_FN(duphigh) - -namespace internal -{ -template <size_t size, size_t... 
Indices> -struct shuffle_index_blend -{ - constexpr static size_t indexcount = sizeof...(Indices); - - template <size_t index> - constexpr inline size_t operator()(csize_t<index>) const - { - return (elements_t<Indices...>::get(csize_t<index % indexcount>()) ? size : 0) + index % size; - } -}; -} // namespace internal - -template <size_t... Indices, typename T, size_t N> -CMT_INLINE vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y, - elements_t<Indices...> i = elements_t<Indices...>()) -{ - return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] * csize_t<N>() + csizeseq_t<N>()); -} -KFR_FN(blend) - -namespace internal -{ -template <size_t elements> -struct shuffle_index_swap -{ - constexpr inline size_t operator()(size_t index) const - { - static_assert(is_poweroftwo(elements), "is_poweroftwo( elements )"); - return index ^ (elements - 1); - } -}; -template <size_t amount, size_t N> -struct shuffle_index_outputright -{ - constexpr inline size_t operator()(size_t index) const - { - return index < N - amount ? index : index + amount; - } -}; -} // namespace internal - -template <size_t elements = 2, typename T, size_t N> -CMT_INLINE vec<T, N> swap(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<N>() ^ csize_t<elements - 1>()); -} -CMT_FN_TPL((size_t elements), (elements), swap) - -template <size_t shift, typename T, size_t N> -CMT_INLINE vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi) -{ - return shift == 0 ? lo : (shift == N ? 
hi : hi.shuffle(lo, csizeseq_t<N, N - shift>())); -} - -template <size_t amount, typename T, size_t N> -CMT_INLINE vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) -{ - static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); - return x.shuffle(csizeseq_t<N, N - amount>() % csize_t<N>()); -} -KFR_FN(rotateright) - -template <size_t amount, typename T, size_t N> -CMT_INLINE vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) -{ - static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); - return x.shuffle(csizeseq_t<N, amount>() % csize_t<N>()); -} -KFR_FN(rotateleft) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> insertright(T x, const vec<T, N>& y) -{ - return concat_and_slice<1, N>(y, vec<T, 1>(x)); -} -KFR_FN(insertright) - -template <typename T, size_t N> -CMT_INLINE vec<T, N> insertleft(T x, const vec<T, N>& y) -{ - return concat_and_slice<0, N>(vec<T, 1>(x), y); -} -KFR_FN(insertleft) - -// template <typename T, size_t N, size_t N2> -// CMT_INLINE vec<T, N> outputright(const vec<T, N>& x, const vec<T, N2>& y) -//{ -// return shufflevector<N, internal::shuffle_index_outputright<N2, N>>(x, extend<N>(y)); -//} -// KFR_FN(outputright) - -namespace internal -{ -template <size_t size, size_t side1> -struct shuffle_index_transpose -{ - constexpr inline size_t operator()(size_t index) const - { - return index % (size / side1) * side1 + index / (size / side1); - } -}; -} // namespace internal - -template <size_t side1, size_t group = 1, typename T, size_t N, size_t size = N / group, - size_t side2 = size / side1, KFR_ENABLE_IF(size > 3)> -CMT_INLINE vec<T, N> transpose(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -template <size_t side, size_t group = 1, typename T, size_t N, KFR_ENABLE_IF(N / group <= 3)> -CMT_INLINE vec<T, N> transpose(const vec<T, N>& x) -{ - 
return x; -} -template <typename T, size_t N> -CMT_INLINE vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x) -{ - return vec<vec<T, N>, N>(transpose<2>(x.flatten())); -} -KFR_FN(transpose) - -template <size_t side2, size_t group = 1, typename T, size_t N, size_t size = N / group, - size_t side1 = size / side2, KFR_ENABLE_IF(size > 3)> -CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)> -CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x) -{ - return x; -} -KFR_FN(transposeinverse) - -template <size_t side, typename T, size_t N> -CMT_INLINE vec<T, N> ctranspose(const vec<T, N>& x) -{ - return transpose<side, 2>(x); -} -KFR_FN(ctranspose) - -template <size_t side, typename T, size_t N> -CMT_INLINE vec<T, N> ctransposeinverse(const vec<T, N>& x) -{ - return transposeinverse<side, 2>(x); -} -KFR_FN(ctransposeinverse) - -template <size_t group = 1, typename T, size_t N, size_t Nout = N * 2, size_t size = Nout / group, - size_t side2 = 2, size_t side1 = size / side2> -CMT_INLINE vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -KFR_FN(interleave) - -template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> -CMT_INLINE internal::expression_function<fn::interleave, E1, E2> interleave(E1&& x, E2&& y) -{ - return { fn::interleave(), std::forward<E1>(x), std::forward<E2>(y) }; -} - -template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side2 = 2, - size_t side1 = size / side2> -CMT_INLINE vec<T, N> interleavehalfs(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * 
csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -KFR_FN(interleavehalfs) - -template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side1 = 2, - size_t side2 = size / side1> -CMT_INLINE vec<T, N> splitpairs(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + - csizeseq_t<size>() / csize_t<side2>())); -} -KFR_FN(splitpairs) - -namespace internal -{ -template <size_t size> -struct shuffle_index_reverse -{ - constexpr inline size_t operator()(size_t index) const { return size - 1 - index; } -}; -} // namespace internal - -template <size_t group = 1, typename T, size_t N, size_t size = N / group> -CMT_INLINE vec<T, N> reverse(const vec<T, N>& x) -{ - return x.shuffle(scale<group>(csizeseq_t<size, size - 1, -1>())); -} -template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x) -{ - return vec<vec<T, N1>, N2>(swap<N1>(x.flatten())); -} -KFR_FN(reverse) - -namespace internal -{ -template <size_t N1, size_t N2> -struct shuffle_index_combine -{ - constexpr inline size_t operator()(size_t index) const { return index >= N2 ? 
index : N1 + index; } -}; -} // namespace internal - -template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y) -{ - static_assert(N2 <= N1, "N2 <= N1"); - return x.shuffle(extend<N1>(y), (csizeseq_t<N1>() < csize_t<N2>()) * csize_t<N1>() + csizeseq_t<N1>()); - // return shufflevector<N1, internal::shuffle_index_combine<N1, N2>>(x, extend<N1>(y)); -} -KFR_FN(combine) - -namespace internal -{ -template <size_t start, size_t stride> -struct generate_index -{ - CMT_INLINE constexpr size_t operator()(size_t index) const { return start + index * stride; } -}; -template <size_t start, size_t size, int on, int off> -struct generate_onoff -{ - CMT_INLINE constexpr size_t operator()(size_t index) const - { - return index >= start && index < start + size ? on : off; - } -}; -} // namespace internal - -template <typename T, size_t N, size_t start = 0, size_t stride = 1> -constexpr CMT_INLINE vec<T, N> enumerate() -{ - return generate_vector<T, N, internal::generate_index<start, stride>>(); -} -template <size_t start = 0, size_t stride = 1, typename T, size_t N> -constexpr CMT_INLINE vec<T, N> enumerate(vec_t<T, N>) -{ - return generate_vector<T, N, internal::generate_index<start, stride>>(); -} -KFR_FN(enumerate) - -template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0> -constexpr CMT_INLINE vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) -{ - return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); -} -template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N> -constexpr CMT_INLINE vec<T, N> onoff(vec_t<T, N>, cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) -{ - return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); -} -KFR_FN(onoff) -} // namespace kfr -#define KFR_SHUFFLE_SPECIALIZATIONS 1 -#include "specializations.i" diff --git a/include/kfr/base/simd_clang.hpp 
b/include/kfr/base/simd_clang.hpp @@ -1,350 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "kfr.h" -#include "platform.hpp" -#include "types.hpp" - -#if CMT_COMPILER_CLANG - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4324)) - -namespace kfr -{ - -template <typename T, size_t... Ns> -constexpr vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept; - -#define KFR_NATIVE_INTRINSICS 1 - -namespace internal -{ -template <typename TT, size_t NN> -using simd_type = TT __attribute__((ext_vector_type(NN))); - -template <typename T, size_t N, bool A> -using simd_storage = internal::struct_with_alignment<simd_type<T, N>, A>; - -template <typename T, size_t N, size_t... indices> -CMT_INLINE simd_type<T, sizeof...(indices)> simd_shuffle(const simd_type<T, N>& x, const simd_type<T, N>& y, - csizes_t<indices...>) -{ - return __builtin_shufflevector(x, y, ((indices >= N * 2) ? -1 : static_cast<int>(indices))...); -} -template <typename T, size_t N, size_t... 
indices> -CMT_INLINE simd_type<T, sizeof...(indices)> simd_shuffle(const simd_type<T, N>& x, csizes_t<indices...>) -{ - return __builtin_shufflevector(x, x, ((indices >= N) ? -1 : static_cast<int>(indices))...); -} - -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -CMT_INLINE simd_type<T, N> simd_read(const T* src) -{ - return ptr_cast<simd_storage<T, N, A>>(src)->value; -} - -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -CMT_INLINE simd_type<T, N> simd_read(const T* src) -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - constexpr auto extend_indices = - cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); - constexpr auto concat_indices = cvalseq_t<size_t, N>(); - return simd_shuffle<T, first>(simd_read<first, A>(src), - simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices), - concat_indices); -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -CMT_INLINE void simd_write(T* dest, const simd_type<T, N>& value) -{ - ptr_cast<simd_storage<T, N, A>>(dest)->value = value; -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -CMT_INLINE void simd_write(T* dest, const simd_type<T, N>& value) -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - simd_write<A, first>(dest, simd_shuffle(value, csizeseq_t<first>())); - simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq_t<rest, first>())); -} -} // namespace internal - -template <typename T, size_t N> -struct alignas(alignof(internal::simd_type<T, N>)) vec : public vec_t<T, N> -{ - static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); - - // type and size - using value_type = T; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr 
static size_t scalar_size() noexcept { return N; } - - using mask_t = mask<T, N>; - - using simd_type = internal::simd_type<T, N>; - using uvalue_type = utype<T>; - using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>; - using usimd_type = internal::simd_type<uvalue_type, N>; - using iusimd_type = internal::simd_type<iuvalue_type, N>; - - // constructors and assignment - // default - constexpr vec() noexcept = default; - // copy - constexpr vec(const vec&) noexcept = default; - // assignment - constexpr vec& operator=(const vec&) noexcept = default; - // from scalar - template <typename U, typename = enable_if<(std::is_convertible<U, value_type>::value)>> - constexpr vec(const U& s) noexcept : simd(s) - { - } - // from list - template <typename... Us> - constexpr vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : simd{ s0, s1, static_cast<value_type>(rest)... } - { - } - // from vector of another type - template <typename U, typename = enable_if<is_simd_type<U>::value>> - constexpr vec(const vec<U, N>& v) noexcept : simd(__builtin_convertvector(v.simd, simd_type)) - { - } - constexpr vec(const simd_type& simd) noexcept : simd(simd) {} - // from list of vectors - template <size_t... Ns, typename = enable_if<csum<size_t, Ns...>() == N>> - constexpr vec(const vec<T, Ns>&... 
vs) noexcept : simd(*concat(vs...)) - { - } - constexpr vec(czeros_t) noexcept : simd(0) {} - constexpr vec(cones_t) noexcept : simd(*(vec() == vec())) {} - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> - constexpr static vec frombits(const vec<U, M>& v) noexcept - { - return (simd_type)(v.flatten().simd); - } - -#define KFR_U(x) ((usimd_type)(x)) -#define KFR_IU(x) ((iusimd_type)(x)) -#define KFR_S(x) ((simd_type)(x)) - - // math / bitwise / comparison operators - constexpr friend vec operator+(const vec& x) noexcept { return x; } - constexpr friend vec operator-(const vec& x) noexcept { return KFR_S(-*x); } - constexpr friend vec operator~(const vec& x) noexcept { return KFR_S(~KFR_U(*x)); } - - constexpr friend vec operator+(const vec& x, const vec& y) noexcept { return *x + *y; } - constexpr friend vec operator-(const vec& x, const vec& y) noexcept { return *x - *y; } - constexpr friend vec operator*(const vec& x, const vec& y) noexcept { return *x * *y; } - constexpr friend vec operator/(const vec& x, const vec& y) noexcept { return *x / *y; } - - constexpr friend vec operator<<(const vec& x, int shift) noexcept { return KFR_S(KFR_IU(*x) << shift); } - constexpr friend vec operator>>(const vec& x, int shift) noexcept { return KFR_S(KFR_IU(*x) >> shift); } - constexpr friend vec operator&(const vec& x, const vec& y) noexcept - { - return KFR_S(KFR_U(*x) & KFR_U(*y)); - } - constexpr friend vec operator|(const vec& x, const vec& y) noexcept - { - return KFR_S(KFR_U(*x) | KFR_U(*y)); - } - constexpr friend vec operator^(const vec& x, const vec& y) noexcept - { - return KFR_S(KFR_U(*x) ^ KFR_U(*y)); - } - - constexpr friend mask_t operator==(const vec& x, const vec& y) noexcept { return KFR_S(*x == *y); } - constexpr friend mask_t operator!=(const vec& x, const vec& y) noexcept { return KFR_S(*x != *y); } - constexpr friend mask_t operator<(const vec& x, const vec& y) noexcept { return KFR_S(*x < *y); } - constexpr friend mask_t 
operator>(const vec& x, const vec& y) noexcept { return KFR_S(*x > *y); } - constexpr friend mask_t operator<=(const vec& x, const vec& y) noexcept { return KFR_S(*x <= *y); } - constexpr friend mask_t operator>=(const vec& x, const vec& y) noexcept { return KFR_S(*x >= *y); } - - constexpr mask_t asmask() const noexcept { return mask_t(simd); } - -#undef KFR_S -#undef KFR_U - - constexpr friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; } - constexpr friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; } - constexpr friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; } - constexpr friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; } - - constexpr friend vec& operator<<=(vec& x, int shift) noexcept { return x = x << shift; } - constexpr friend vec& operator>>=(vec& x, int shift) noexcept { return x = x >> shift; } - constexpr friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; } - constexpr friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; } - constexpr friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; } - - constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } - constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - constexpr friend vec operator++(vec& x, int) noexcept - { - const vec z = x; - ++x; - return z; - } - constexpr friend vec operator--(vec& x, int) noexcept - { - const vec z = x; - --x; - return z; - } - - // shuffle - template <size_t... indices> - vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return __builtin_shufflevector(simd, simd, (indices >= N ? -1 : int(indices))...); - } - template <size_t... indices> - vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return __builtin_shufflevector(simd, y.simd, (indices >= N * 2 ? 
-1 : int(indices))...); - } - - // element access - struct element; - constexpr value_type operator[](size_t index) const& noexcept { return get(index); } - constexpr value_type operator[](size_t index) && noexcept { return get(index); } - constexpr element operator[](size_t index) & noexcept { return { *this, index }; } - - constexpr value_type get(size_t index) const noexcept { return simd[index]; } - constexpr void set(size_t index, const value_type& s) noexcept { simd[index] = s; } - template <size_t index> - constexpr value_type get(csize_t<index>) const noexcept - { - return simd[index]; - } - template <size_t index> - constexpr void set(csize_t<index>, const value_type& s) noexcept - { - simd[index] = s; - } - struct element - { - constexpr operator value_type() const noexcept { return v.get(index); } - element& operator=(const value_type& s) noexcept - { - v.set(index, s); - return *this; - } - element& operator=(const element& s) noexcept - { - v.set(index, static_cast<value_type>(s)); - return *this; - } - template <typename U, size_t M> - element& operator=(const typename vec<U, M>::element& s) noexcept - { - v.set(index, static_cast<value_type>(static_cast<U>(s))); - return *this; - } - vec& v; - size_t index; - }; - - // read/write - template <bool aligned = false> - explicit constexpr vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept - : simd(internal::simd_read<N, aligned>(src)) - { - } - template <bool aligned = false> - const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept - { - internal::simd_write<aligned, N>(dest, simd); - return *this; - } - - // native SIMD type access - const vec& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return simd; } - simd_type& operator*() noexcept { return simd; } - -protected: - template <typename U, size_t M> - friend struct vec; - - simd_type simd; - -private: -}; - -namespace internal -{ -template <typename T, 
size_t N> -CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x) -{ - return x; -} - -template <typename T, size_t N> -CMT_INLINE vec<T, N * 2> concat_impl(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<N * 2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<N1 + N2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(csizeseq_t<N2, -(N2 - N1)>()) - .shuffle(y, csizeseq_t<N2 * 2>()) - .shuffle(csizeseq_t<N1 + N2, N2 - N1>()); -} - -template <typename T, size_t N1, size_t N2, size_t... Sizes> -CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y, - const vec<T, Sizes>&... args) -{ - return concat_impl(concat_impl(x, y), args...); -} -} // namespace internal - -template <typename T, size_t... Ns> -constexpr inline vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept -{ - return internal::concat_impl(vs...); -} -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) - -#endif diff --git a/include/kfr/base/simd_intrin.hpp b/include/kfr/base/simd_intrin.hpp @@ -1,392 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "kfr.h" - -#include "constants.hpp" -#include "platform.hpp" -#include "types.hpp" - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4324)) -CMT_PRAGMA_MSVC(warning(disable : 4814)) - -#ifdef CMT_INTRINSICS_IS_CONSTEXPR -#define KFR_I_CE constexpr -#else -#define KFR_I_CE -#endif - -namespace kfr -{ - -template <typename T, size_t... Ns> -constexpr vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept; - -#define KFR_NATIVE_INTRINSICS 1 - -template <typename T, size_t N> -struct simd_type_holder -{ - using type = struct - { - T v[N]; - }; -}; - -#define KFR_SIMD_SPEC_TYPE(T, N, MM) \ - template <> \ - struct simd_type_holder<T, N> \ - { \ - using type = MM; \ - }; - -#ifdef CMT_ARCH_SSE2 -KFR_SIMD_SPEC_TYPE(u8, 16, __m128i); -KFR_SIMD_SPEC_TYPE(u16, 8, __m128i); -KFR_SIMD_SPEC_TYPE(u32, 4, __m128i); -KFR_SIMD_SPEC_TYPE(u64, 2, __m128i); -KFR_SIMD_SPEC_TYPE(i8, 16, __m128i); -KFR_SIMD_SPEC_TYPE(i16, 8, __m128i); -KFR_SIMD_SPEC_TYPE(i32, 4, __m128i); -KFR_SIMD_SPEC_TYPE(i64, 2, __m128i); -KFR_SIMD_SPEC_TYPE(f32, 4, __m128); -KFR_SIMD_SPEC_TYPE(f64, 2, __m128d); -#endif - -#ifdef CMT_ARCH_AVX -KFR_SIMD_SPEC_TYPE(u8, 32, __m256i); -KFR_SIMD_SPEC_TYPE(u16, 16, __m256i); -KFR_SIMD_SPEC_TYPE(u32, 8, __m256i); -KFR_SIMD_SPEC_TYPE(u64, 4, __m256i); -KFR_SIMD_SPEC_TYPE(i8, 32, __m256i); -KFR_SIMD_SPEC_TYPE(i16, 16, __m256i); -KFR_SIMD_SPEC_TYPE(i32, 8, __m256i); -KFR_SIMD_SPEC_TYPE(i64, 4, __m256i); -KFR_SIMD_SPEC_TYPE(f32, 8, __m256); -KFR_SIMD_SPEC_TYPE(f64, 4, __m256d); -#endif - 
-#ifdef CMT_ARCH_AVX512 -KFR_SIMD_SPEC_TYPE(u8, 64, __m512i); -KFR_SIMD_SPEC_TYPE(u16, 32, __m512i); -KFR_SIMD_SPEC_TYPE(u32, 16, __m512i); -KFR_SIMD_SPEC_TYPE(u64, 8, __m512i); -KFR_SIMD_SPEC_TYPE(i8, 64, __m512i); -KFR_SIMD_SPEC_TYPE(i16, 32, __m512i); -KFR_SIMD_SPEC_TYPE(i32, 16, __m512i); -KFR_SIMD_SPEC_TYPE(i64, 8, __m512i); -KFR_SIMD_SPEC_TYPE(f32, 16, __m512); -KFR_SIMD_SPEC_TYPE(f64, 8, __m512d); -#endif - -#ifdef CMT_ARCH_NEON -KFR_SIMD_SPEC_TYPE(u8, 16, uint8x16_t); -KFR_SIMD_SPEC_TYPE(u16, 8, uint16x8_t); -KFR_SIMD_SPEC_TYPE(u32, 4, uint32x4_t); -KFR_SIMD_SPEC_TYPE(u64, 2, uint64x2_t); -KFR_SIMD_SPEC_TYPE(i8, 16, int8x16_t); -KFR_SIMD_SPEC_TYPE(i16, 8, int16x8_t); -KFR_SIMD_SPEC_TYPE(i32, 4, int32x4_t); -KFR_SIMD_SPEC_TYPE(i64, 2, int64x2_t); -KFR_SIMD_SPEC_TYPE(f32, 4, float32x4_t); -#ifdef CMT_ARCH_NEON64 -KFR_SIMD_SPEC_TYPE(f64, 2, float64x2_t); -#endif -#endif - -template <size_t N> -struct raw_bytes -{ - u8 raw[N]; -}; - -#define KFR_CYCLE(...) \ - for (size_t i = 0; i < N; i++) \ - __VA_ARGS__ - -#define KFR_C_CYCLE(...) \ - for (size_t i = 0; i < N; i++) \ - vs[i] = __VA_ARGS__ - -#define KFR_R_CYCLE(...) \ - vec<T, N> result; \ - for (size_t i = 0; i < N; i++) \ - result.vs[i] = __VA_ARGS__; \ - return result - -#define KFR_B_CYCLE(...) \ - vec<T, N> result; \ - for (size_t i = 0; i < N; i++) \ - result.vs[i] = (__VA_ARGS__) ? 
constants<value_type>::allones() : value_type(0); \ - return result.asmask() - -template <typename T, size_t N> -struct alignas(const_min(platform<>::maximum_vector_alignment, sizeof(T) * next_poweroftwo(N))) vec - : vec_t<T, N> -{ - constexpr static size_t simd_width = platform<T>::vector_width; - constexpr static size_t count = (N + simd_width - 1) / simd_width; - - static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); - - // type and size - using value_type = T; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr static size_t scalar_size() noexcept { return N; } - - using simd_type = typename simd_type_holder<T, N>::type; - - using uvalue_type = utype<T>; - using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>; - - using mask_t = mask<T, N>; - - using uvec = vec<uvalue_type, N>; - using iuvec = vec<iuvalue_type, N>; - - // constructors and assignment - // default - constexpr vec() noexcept = default; - // copy - vec(const vec&) noexcept = default; - // assignment - CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default; - - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return vec<value_type, sizeof...(indices)>((indices < N ? vs[indices % N] : 0)...); - } - template <size_t... indices> - KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return vec<value_type, sizeof...(indices)>( - (indices < N ? vs[indices % N] : indices < 2 * N ? 
y.vs[(indices - N) % N] : 0)...); - } - - template <typename U, typename = enable_if<(std::is_convertible<U, value_type>::value)>> - KFR_I_CE vec(const U& s) noexcept - { - KFR_C_CYCLE(s); - } - - constexpr vec(const simd_type& simd) noexcept : simd(simd) {} - // from vector of another type - template <typename U, typename = enable_if<is_simd_type<U>::value>> - KFR_I_CE vec(const vec<U, N>& v) noexcept - { - KFR_C_CYCLE(static_cast<value_type>(v.vs[i])); - } - // from list - template <typename... Us> - KFR_I_CE vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : vs{ s0, s1, static_cast<value_type>(rest)... } - { - } - template <size_t N1, size_t... Ns, typename = enable_if<(csum<size_t, N1, Ns...>() == N)>> - KFR_I_CE vec(const vec<T, N1>& v0, const vec<T, Ns>&... vecs) noexcept : simd(*concat(v0, vecs...)) - { - } - - KFR_I_CE vec(czeros_t) noexcept { KFR_C_CYCLE(value_type(0)); } - KFR_I_CE vec(cones_t) noexcept { KFR_C_CYCLE(constants<value_type>::allones()); } - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> - KFR_I_CE static vec frombits(const vec<U, M>& v) noexcept - { - vec r; - r.bytes = v.flatten().bytes; - return r; - } - - KFR_I_CE vec operator+() const noexcept { return *this; } - KFR_I_CE vec operator-() const noexcept { KFR_R_CYCLE(-this->vs[i]); } - KFR_I_CE vec operator~() const noexcept - { - uvec xx = uvec::frombits(*this); - KFR_CYCLE(xx.vs[i] = ~xx.vs[i]); - return frombits(xx); - } - - KFR_I_CE vec operator+(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] + y.vs[i]); } - KFR_I_CE vec operator-(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] - y.vs[i]); } - KFR_I_CE vec operator*(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] * y.vs[i]); } - KFR_I_CE vec operator/(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] / y.vs[i]); } - - KFR_I_CE vec operator<<(int shift) const noexcept - { - iuvec xx = iuvec::frombits(*this); - KFR_CYCLE(xx.vs[i] <<= shift); - 
return frombits(xx); - } - KFR_I_CE vec operator>>(int shift) const noexcept - { - iuvec xx = iuvec::frombits(*this); - KFR_CYCLE(xx.vs[i] >>= shift); - return frombits(xx); - } - KFR_I_CE vec operator&(const vec& y) const noexcept - { - uvec xx = uvec::frombits(*this); - uvec yy = uvec::frombits(y); - KFR_CYCLE(xx.vs[i] &= yy.vs[i]); - return frombits(xx); - } - KFR_I_CE vec operator|(const vec& y) const noexcept - { - uvec xx = uvec::frombits(*this); - uvec yy = uvec::frombits(y); - KFR_CYCLE(xx.vs[i] |= yy.vs[i]); - return frombits(xx); - } - KFR_I_CE vec operator^(const vec& y) const noexcept - { - uvec xx = uvec::frombits(*this); - uvec yy = uvec::frombits(y); - KFR_CYCLE(xx.vs[i] ^= yy.vs[i]); - return frombits(xx); - } - - KFR_I_CE mask_t operator==(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] == y.vs[i]); } - KFR_I_CE mask_t operator!=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] != y.vs[i]); } - KFR_I_CE mask_t operator<(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] < y.vs[i]); } - KFR_I_CE mask_t operator>(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] > y.vs[i]); } - KFR_I_CE mask_t operator<=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] <= y.vs[i]); } - KFR_I_CE mask_t operator>=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] >= y.vs[i]); } - - constexpr mask_t asmask() const noexcept { return mask_t(simd); } - - KFR_I_CE vec& operator+=(const vec& y) noexcept { return *this = *this + y; } - KFR_I_CE vec& operator-=(const vec& y) noexcept { return *this = *this - y; } - KFR_I_CE vec& operator*=(const vec& y) noexcept { return *this = *this * y; } - KFR_I_CE vec& operator/=(const vec& y) noexcept { return *this = *this / y; } - KFR_I_CE vec& operator<<=(int shift) noexcept { return *this = *this << shift; } - KFR_I_CE vec& operator>>=(int shift) noexcept { return *this = *this >> shift; } - KFR_I_CE vec& operator&=(const vec& y) noexcept { return *this = *this & y; } - KFR_I_CE vec& operator|=(const 
vec& y) noexcept { return *this = *this | y; } - KFR_I_CE vec& operator^=(const vec& y) noexcept { return *this = *this ^ y; } - - KFR_I_CE vec& operator++() noexcept { return *this = *this + vec(1); } - KFR_I_CE vec& operator--() noexcept { return *this = *this - vec(1); } - KFR_I_CE vec operator++(int) noexcept - { - const vec z = *this; - ++*this; - return z; - } - KFR_I_CE vec operator--(int) noexcept - { - const vec z = *this; - --*this; - return z; - } - - explicit KFR_I_CE vec(const value_type* src) { KFR_C_CYCLE(src[i]); } - explicit KFR_I_CE vec(const value_type* src, cunaligned_t) { KFR_C_CYCLE(src[i]); } - explicit KFR_I_CE vec(const value_type* src, caligned_t) { KFR_C_CYCLE(src[i]); } - - const vec& write(value_type* dest) const - { - KFR_CYCLE(dest[i] = vs[i]); - return *this; - } - const vec& write(value_type* dest, cunaligned_t) const - { - KFR_CYCLE(dest[i] = vs[i]); - return *this; - } - const vec& write(value_type* dest, caligned_t) const - { - KFR_CYCLE(dest[i] = vs[i]); - return *this; - } - - KFR_I_CE value_type operator[](size_t index) const noexcept { return vs[index]; } - KFR_I_CE value_type& operator[](size_t index) noexcept { return vs[index]; } - - const vec& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return simd; } - simd_type& operator*() noexcept { return simd; } - -protected: - template <typename, size_t> - friend struct vec; - - union { - T vs[N]; - simd_type simd; - raw_bytes<N * sizeof(T)> bytes; - }; -}; - -namespace internal -{ -template <typename T, size_t N> -CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x) -{ - return x; -} - -template <typename T, size_t N> -CMT_INLINE vec<T, N * 2> concat_impl(const vec<T, N>& x, const vec<T, N>& y) -{ - return x.shuffle(y, csizeseq_t<N * 2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return 
x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<N1 + N2>()); -} - -template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> -CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return x.shuffle(csizeseq_t<N2, -(N2 - N1)>()) - .shuffle(y, csizeseq_t<N2 * 2>()) - .shuffle(csizeseq_t<N1 + N2, N2 - N1>()); -} - -template <typename T, size_t N1, size_t N2, size_t... Sizes> -CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y, - const vec<T, Sizes>&... args) -{ - return concat_impl(concat_impl(x, y), args...); -} -} // namespace internal - -template <typename T, size_t... Ns> -constexpr inline vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept -{ - return internal::concat_impl(vs...); -} -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/simd_x86.hpp b/include/kfr/base/simd_x86.hpp @@ -1,272 +0,0 @@ -#pragma once - -#include "constants.hpp" -#include "platform.hpp" -#include "simd_intrin.hpp" -namespace kfr -{ -#ifdef CMT_ARCH_SSE2 - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator+(const vec<f32, 4>& y) const noexcept -{ - return _mm_add_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator-(const vec<f32, 4>& y) const noexcept -{ - return _mm_sub_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator*(const vec<f32, 4>& y) const noexcept -{ - return _mm_mul_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator/(const vec<f32, 4>& y) const noexcept -{ - return _mm_div_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator&(const vec<f32, 4>& y) const noexcept -{ - return _mm_and_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator|(const vec<f32, 4>& y) const noexcept -{ - return 
_mm_or_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator^(const vec<f32, 4>& y) const noexcept -{ - return _mm_xor_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator+(const vec<f64, 2>& y) const noexcept -{ - return _mm_add_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator-(const vec<f64, 2>& y) const noexcept -{ - return _mm_sub_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator*(const vec<f64, 2>& y) const noexcept -{ - return _mm_mul_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator/(const vec<f64, 2>& y) const noexcept -{ - return _mm_div_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator&(const vec<f64, 2>& y) const noexcept -{ - return _mm_and_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator|(const vec<f64, 2>& y) const noexcept -{ - return _mm_or_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator^(const vec<f64, 2>& y) const noexcept -{ - return _mm_xor_pd(simd, y.simd); -} - -#endif // CMT_ARCH_SSE2 - -#ifdef CMT_ARCH_AVX - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator+(const vec<f32, 8>& y) const noexcept -{ - return _mm256_add_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator-(const vec<f32, 8>& y) const noexcept -{ - return _mm256_sub_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator*(const vec<f32, 8>& y) const noexcept -{ - return _mm256_mul_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator/(const vec<f32, 8>& y) const noexcept -{ - return _mm256_div_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator&(const vec<f32, 8>& y) const noexcept -{ - return 
_mm256_and_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator|(const vec<f32, 8>& y) const noexcept -{ - return _mm256_or_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator^(const vec<f32, 8>& y) const noexcept -{ - return _mm256_xor_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator+(const vec<f64, 4>& y) const noexcept -{ - return _mm256_add_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator-(const vec<f64, 4>& y) const noexcept -{ - return _mm256_sub_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator*(const vec<f64, 4>& y) const noexcept -{ - return _mm256_mul_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator/(const vec<f64, 4>& y) const noexcept -{ - return _mm256_div_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator&(const vec<f64, 4>& y) const noexcept -{ - return _mm256_and_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator|(const vec<f64, 4>& y) const noexcept -{ - return _mm256_or_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator^(const vec<f64, 4>& y) const noexcept -{ - return _mm256_xor_pd(simd, y.simd); -} - -#endif // CMT_ARCH_AVX - -#ifdef CMT_ARCH_AVX512 - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator+(const vec<f32, 16>& y) const noexcept -{ - return _mm512_add_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator-(const vec<f32, 16>& y) const noexcept -{ - return _mm512_sub_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator*(const vec<f32, 16>& y) const noexcept -{ - return _mm512_mul_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator/(const 
vec<f32, 16>& y) const noexcept -{ - return _mm512_div_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator&(const vec<f32, 16>& y) const noexcept -{ - return _mm512_and_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator|(const vec<f32, 16>& y) const noexcept -{ - return _mm512_or_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator^(const vec<f32, 16>& y) const noexcept -{ - return _mm512_xor_ps(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator+(const vec<f64, 8>& y) const noexcept -{ - return _mm512_add_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator-(const vec<f64, 8>& y) const noexcept -{ - return _mm512_sub_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator*(const vec<f64, 8>& y) const noexcept -{ - return _mm512_mul_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator/(const vec<f64, 8>& y) const noexcept -{ - return _mm512_div_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator&(const vec<f64, 8>& y) const noexcept -{ - return _mm512_and_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator|(const vec<f64, 8>& y) const noexcept -{ - return _mm512_or_pd(simd, y.simd); -} - -template <> -KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator^(const vec<f64, 8>& y) const noexcept -{ - return _mm512_xor_pd(simd, y.simd); -} - -#endif // CMT_ARCH_AVX - -} // namespace kfr diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp @@ -1,315 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - 
the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/sin_cos.hpp" - -namespace kfr -{ - -/** - * @brief Returns the trigonometric sine of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sin(const T1& x) -{ - return intrinsics::sin(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sin, E1> sin(E1&& x) -{ - return { fn::sin(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cos(const T1& x) -{ - return intrinsics::cos(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cos, E1> cos(E1&& x) -{ - return { fn::cos(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric sine of x. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastsin(const T1& x) -{ - return intrinsics::fastsin(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric sine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastsin, E1> fastsin(E1&& x) -{ - return { fn::fastsin(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric cosine of x. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastcos(const T1& x) -{ - return intrinsics::fastcos(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric cosine of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastcos, E1> fastcos(E1&& x) -{ - return { fn::fastcos(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must - * be a vector. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sincos(const T1& x) -{ - return intrinsics::sincos(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of the even elements of the x and - * cosine of the odd elements. x must be a vector. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sincos, E1> sincos(E1&& x) -{ - return { fn::sincos(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must - * be a vector. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cossin(const T1& x) -{ - return intrinsics::cossin(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of the even elements of the x and - * sine of the odd elements. x must be a vector. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cossin, E1> cossin(E1&& x) -{ - return { fn::cossin(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the x (expressed in degrees). - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sindeg(const T1& x) -{ - return intrinsics::sindeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of the x (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sindeg, E1> sindeg(E1&& x) -{ - return { fn::sindeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of the x (expressed in degrees). - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cosdeg(const T1& x) -{ - return intrinsics::cosdeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of the x (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cosdeg, E1> cosdeg(E1&& x) -{ - return { fn::cosdeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric sine of the x (expressed in degrees). 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastsindeg(const T1& x) -{ - return intrinsics::fastsindeg(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric sine of the x - * (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastsindeg, E1> fastsindeg(E1&& x) -{ - return { fn::fastsindeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns an approximation of the trigonometric cosine of the x (expressed in degrees). - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> fastcosdeg(const T1& x) -{ - return intrinsics::fastcosdeg(x); -} - -/** - * @brief Returns template expression that returns an approximation of the trigonometric cosine of the x - * (expressed in degrees). - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::fastcosdeg, E1> fastcosdeg(E1&& x) -{ - return { fn::fastcosdeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must - * be a vector and expressed in degrees. - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sincosdeg(const T1& x) -{ - return intrinsics::sincosdeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric sine of the even elements of the x and - * cosine of the odd elements. x must be expressed in degrees. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sincosdeg, E1> sincosdeg(E1&& x) -{ - return { fn::sincosdeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must - * be a vector and expressed in degrees. 
- */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> cossindeg(const T1& x) -{ - return intrinsics::cossindeg(x); -} - -/** - * @brief Returns template expression that returns the trigonometric cosine of the even elements of the x and - * sine of the odd elements. x must be expressed in degrees. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::cossindeg, E1> cossindeg(E1&& x) -{ - return { fn::cossindeg(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the sinc function of x. - * \f[ - * sinc(x) = \frac{sin(x)}{x} - * \f] - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> sinc(const T1& x) -{ - return intrinsics::sinc(x); -} - -/** - * @brief Returns template expression that returns the sinc function of x. - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinc, E1> sinc(E1&& x) -{ - return { fn::sinc(), std::forward<E1>(x) }; -} - -/** - * @brief Returns the trigonometric sine of the angle 2x using sin(x) and cos(x). - */ -template <typename T> -KFR_SINTRIN T sin2x(const T& sinx, const T& cosx) -{ - return 2 * sinx * cosx; -} - -/** - * @brief Returns the trigonometric sine of the angle 3x using sin(x) and cos(x). - */ -template <typename T> -KFR_SINTRIN T sin3x(const T& sinx, const T& cosx) -{ - return sinx * (-1 + 4 * sqr(cosx)); -} - -/** - * @brief Returns the trigonometric cosine of the angle 2x using sin(x) and cos(x). - */ -template <typename T> -KFR_SINTRIN T cos2x(const T& sinx, const T& cosx) -{ - return sqr(cosx) - sqr(sinx); -} - -/** - * @brief Returns the trigonometric cosine of the angle 3x using sin(x) and cos(x). 
- */ -template <typename T> -KFR_SINTRIN T cos3x(const T& sinx, const T& cosx) -{ - return cosx * (1 - 4 * sqr(sinx)); -} -} // namespace kfr diff --git a/include/kfr/base/small_buffer.hpp b/include/kfr/base/small_buffer.hpp @@ -1,4 +1,4 @@ -/** @addtogroup utility +/** @addtogroup types * @{ */ /* @@ -31,16 +31,15 @@ namespace kfr { - template <typename T, std::size_t Capacity = 16> struct small_buffer { public: - small_buffer() noexcept : m_size(0), m_data(m_preallocated) {} + small_buffer() CMT_NOEXCEPT : m_size(0), m_data(m_preallocated) {} small_buffer(std::size_t size) : small_buffer() { resize(size); } - friend void swap(small_buffer<T, Capacity>& first, small_buffer<T, Capacity>& second) noexcept + friend void swap(small_buffer<T, Capacity>& first, small_buffer<T, Capacity>& second) CMT_NOEXCEPT { using std::swap; diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp @@ -25,12 +25,15 @@ */ #pragma once -#include "min_max.hpp" -#include "shuffle.hpp" -#include "vec.hpp" +#include "../math/min_max.hpp" +#include "../simd/shuffle.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + /** * @brief Sort the elements in the vector in ascending order * @param x input vector @@ -40,12 +43,12 @@ namespace kfr * @endcode */ template <typename T, size_t N> -CMT_INLINE vec<T, N> sort(const vec<T, N>& x) +KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x) { constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); vec<T, Nhalf> o = high(x); - constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>()); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); for (size_t i = 0; i < Nhalf; i++) { vec<T, Nhalf> t; @@ -73,12 +76,12 @@ CMT_INLINE vec<T, N> sort(const vec<T, N>& x) * @endcode */ template <typename T, size_t N> -CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x) +KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x) { constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); vec<T, 
Nhalf> o = high(x); - constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>()); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); for (size_t i = 0; i < Nhalf; i++) { vec<T, Nhalf> t; @@ -96,4 +99,5 @@ CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x) } return interleavehalfs(concat(e, o)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/base/specializations.i b/include/kfr/base/specializations.i @@ -1,109 +0,0 @@ -/** - * Copyright (C) 2016 D Levin (http://www.kfrlib.com) - * This file is part of KFR - * - * KFR is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * KFR is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with KFR. 
- */ -#pragma once - -#include "vec.hpp" -#ifndef KFR_SHUFFLE_SPECIALIZATIONS -#include "shuffle.hpp" -#endif - -#ifdef KFR_COMPILER_GNU - -namespace kfr -{ -namespace internal -{ -template <> -inline vec<f32, 32> shufflevector<f32, 32>( - csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, - 15, 22, 23, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) -{ - f32x32 w = x; - - w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)), - permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w))); - - w = permutegroups<(4), 0, 4, 2, 6, 1, 5, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op - return w; -} - -template <> -inline vec<f32, 32> shufflevector<f32, 32>( - csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22, - 23, 14, 15, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) -{ - f32x32 w = x; - - w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)), - permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w))); - - w = permutegroups<(4), 0, 4, 1, 5, 2, 6, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op - return w; -} - -inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x) -{ - return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, - 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>, - x, x); -} - -template <> -inline vec<f32, 64> shufflevector<f32, 64>( - csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13, - 44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38, - 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>, - const vec<f32, 64>& x, const vec<f32, 64>&) -{ - return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x)))); -} - -template <> 
-inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>, - const vec<f32, 16>& x, const vec<f32, 16>&) -{ -// asm volatile("int $3"); - const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x); - - return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)), shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx))); -} - -template <> -inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>, - const vec<f32, 16>& x, const vec<f32, 16>&) -{ - const vec<f32, 16> xx = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); - - return permutegroups<(4), 0, 2, 1, 3>(xx); -} - -template <> -inline vec<f32, 32> shufflevector<f32, 32>( - csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, - 29, 14, 30, 15, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) -{ - const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x); - - return concat(interleavehalfs(low(xx)), interleavehalfs(high(xx))); -} -} -} -#endif diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp @@ -1,50 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "impl/sqrt.hpp" - -namespace kfr -{ - -/** - * @brief Returns the positive square root of the x. \f$\sqrt{x}\f$ - */ -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> sqrt(const T1& x) -{ - return intrinsics::sqrt(x); -} - -/** - * @brief Returns template expression that returns the positive square root of the x. \f$\sqrt{x}\f$ - */ -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::sqrt, E1> sqrt(E1&& x) -{ - return { fn::sqrt(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp @@ -1,56 +0,0 @@ -/** @addtogroup math - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "impl/tan.hpp" - -namespace kfr -{ - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> tan(const T1& x) -{ - return intrinsics::tan(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::tan, E1> tan(E1&& x) -{ - return { fn::tan(), std::forward<E1>(x) }; -} - -template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC flt_type<T1> tandeg(const T1& x) -{ - return intrinsics::tandeg(x); -} - -template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::tandeg, E1> tandeg(E1&& x) -{ - return { fn::tandeg(), std::forward<E1>(x) }; -} -} // namespace kfr diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp @@ -1,429 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once -#include "kfr.h" - -#include "intrinsics.h" - -#include <cmath> - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") - -#ifdef KFR_TESTING -#include "../testo/testo.hpp" -#endif - -#include "../cometa.hpp" - -#define KFR_ENABLE_IF CMT_ENABLE_IF - -/** - * @brief Internal macro for functions - */ -#define KFR_FN(FN) \ - namespace fn \ - { \ - struct FN \ - { \ - template <typename... Args> \ - CMT_INLINE_MEMBER decltype(::kfr::FN(std::declval<Args>()...)) operator()(Args&&... args) const \ - { \ - return ::kfr::FN(std::forward<Args>(args)...); \ - } \ - }; \ - } - -/** - * @brief Internal macro for functions - */ -#define KFR_I_FN(FN) \ - namespace fn \ - { \ - struct FN \ - { \ - template <typename... Args> \ - CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \ - Args&&... args) const \ - { \ - return ::kfr::intrinsics::FN(std::forward<Args>(args)...); \ - } \ - }; \ - } - -namespace kfr -{ -// Include all from CoMeta library -using namespace cometa; - -/// @brief Short names for common types -using f32 = float; -using f64 = double; -using i8 = int8_t; -using i16 = int16_t; -using i32 = int32_t; -using i64 = int64_t; -using u8 = uint8_t; -using u16 = uint16_t; -using u32 = uint32_t; -using u64 = uint64_t; -using umax = uint64_t; -using imax = int64_t; -using fmax = double; -using f80 = long double; - -#if defined(KFR_BASETYPE_F32) || defined(KFR_NO_NATIVE_F64) -/// @brief Floating point type used by default -using fbase = f32; -#else -/// @brief Floating point type used by default -using fbase = f64; -#endif - -constexpr ctype_t<f32> ctype_f32{}; -constexpr ctype_t<f64> ctype_f64{}; -constexpr ctype_t<i8> ctype_i8{}; -constexpr ctype_t<i16> ctype_i16{}; -constexpr ctype_t<i32> ctype_i32{}; -constexpr ctype_t<i64> ctype_i64{}; -constexpr ctype_t<u8> ctype_u8{}; -constexpr ctype_t<u16> ctype_u16{}; -constexpr ctype_t<u32> ctype_u32{}; -constexpr ctype_t<u64> ctype_u64{}; 
-constexpr ctype_t<umax> ctype_umax{}; -constexpr ctype_t<imax> ctype_imax{}; -constexpr ctype_t<fmax> ctype_fmax{}; -constexpr ctype_t<f80> ctype_f80{}; -constexpr ctype_t<fbase> ctype_base{}; - -struct u24 -{ - u8 raw[3]; -}; - -struct i24 -{ - u8 raw[3]; - - i24(i32 x) - { - raw[0] = x & 0xFF; - raw[1] = (x >> 8) & 0xFF; - raw[2] = (x >> 16) & 0xFF; - } - - i32 as_int() const - { - return static_cast<i32>(raw[0]) | static_cast<i32>(raw[1] << 8) | - (static_cast<i32>(raw[2] << 24) >> 8); - } - - operator int() const { return as_int(); } -}; - -struct f16 -{ - u16 raw; -}; - -/// @brief An enumeration representing data type -template <typename T1> -struct range -{ - T1 min; - T1 max; - T1 distance() const { return max - min; } -}; - -/// @brief An enumeration representing data type -enum class datatype : int -{ - typebits_mask = 0xFF, - f = 0x100, - i = 0x200, - u = 0x300, - c = 0x400, - typeclass_mask = 0xF00, - x1 = 0x1000, - x2 = 0x2000, - x3 = 0x3000, - x4 = 0x4000, - typecomponents_mask = 0xF000, - f16 = static_cast<int>(f) | static_cast<int>(x1) | 16, - f32 = static_cast<int>(f) | static_cast<int>(x1) | 32, - f64 = static_cast<int>(f) | static_cast<int>(x1) | 64, - f80 = static_cast<int>(f) | static_cast<int>(x1) | 80, - i8 = static_cast<int>(i) | static_cast<int>(x1) | 8, - i16 = static_cast<int>(i) | static_cast<int>(x1) | 16, - i24 = static_cast<int>(i) | static_cast<int>(x1) | 24, - i32 = static_cast<int>(i) | static_cast<int>(x1) | 32, - i64 = static_cast<int>(i) | static_cast<int>(x1) | 64, - u8 = static_cast<int>(u) | static_cast<int>(x1) | 8, - u16 = static_cast<int>(u) | static_cast<int>(x1) | 16, - u24 = static_cast<int>(u) | static_cast<int>(x1) | 24, - u32 = static_cast<int>(u) | static_cast<int>(x1) | 32, - u64 = static_cast<int>(u) | static_cast<int>(x1) | 64, - c32 = static_cast<int>(c) | static_cast<int>(x2) | 32, - c64 = static_cast<int>(c) | static_cast<int>(x2) | 64 -}; - -inline datatype operator|(datatype x, datatype y) -{ - using type = 
underlying_type<datatype>; - return static_cast<datatype>(static_cast<type>(x) | static_cast<type>(y)); -} - -inline datatype operator&(datatype x, datatype y) -{ - using type = underlying_type<datatype>; - return static_cast<datatype>(static_cast<type>(x) & static_cast<type>(y)); -} - -template <typename T> -constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value - ? datatype::f - : std::is_integral<typename compound_type_traits<T>::subtype>::value - ? (std::is_unsigned<typename compound_type_traits<T>::subtype>::value - ? datatype::u - : datatype::i) - : datatype(); - -template <typename T> -using is_f_class = std::integral_constant<bool, typeclass<T> == datatype::f>; -template <typename T> -using is_u_class = std::integral_constant<bool, typeclass<T> == datatype::u>; -template <typename T> -using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>; - -template <typename T> -struct typebits -{ - static_assert(is_number<deep_subtype<T>>::value, ""); - constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8; - constexpr static size_t width = compound_type_traits<T>::is_scalar ? 
0 : compound_type_traits<T>::width; - using subtype = typename compound_type_traits<T>::subtype; -}; - -namespace fn -{ -///@copybrief cometa::pass_through -using pass_through = cometa::fn_pass_through; - -///@copybrief cometa::noop -using noop = cometa::fn_noop; - -///@copybrief cometa::get_first -using get_first = cometa::fn_get_first; - -///@copybrief cometa::get_second -using get_second = cometa::fn_get_second; - -///@copybrief cometa::get_third -using get_third = cometa::fn_get_third; - -///@copybrief cometa::returns -template <typename T> -using returns = cometa::fn_returns<T>; -} // namespace fn - -template <typename T> -using ftype = - typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>; -template <typename T> -using itype = - typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>; -template <typename T> -using utype = - typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>; - -template <typename T> -using fsubtype = ftype<subtype<T>>; -template <typename T> -using isubtype = itype<subtype<T>>; -template <typename T> -using usubtype = utype<subtype<T>>; - -namespace internal -{ -template <typename T> -struct flt_type_impl -{ - using type = fbase; -}; - -template <> -struct flt_type_impl<float> -{ - using type = float; -}; -template <> -struct flt_type_impl<double> -{ - using type = double; -}; -} // namespace internal - -template <typename T> -using flt_type = typename internal::flt_type_impl<T>::type; - -namespace internal -{ -#ifdef CMT_COMPILER_CLANG -#define builtin_addressof(x) __builtin_addressof(x) -#else -template <class T> -inline T* builtin_addressof(T& arg) -{ - return reinterpret_cast<T*>(&const_cast<char&>(reinterpret_cast<const volatile char&>(arg))); -} -#endif - -#ifdef CMT_COMPILER_GNU -CMT_INLINE f32 builtin_sqrt(f32 x) { return __builtin_sqrtf(x); } -CMT_INLINE f64 builtin_sqrt(f64 x) { return 
__builtin_sqrt(x); } -CMT_INLINE f80 builtin_sqrt(f80 x) { return __builtin_sqrtl(x); } -CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) -{ - __builtin_memcpy(dest, src, size); -} -CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); } -#else - -CMT_INLINE f32 builtin_sqrt(f32 x) { return ::sqrtf(x); } -CMT_INLINE f64 builtin_sqrt(f64 x) { return ::sqrt(x); } -CMT_INLINE f80 builtin_sqrt(f80 x) { return ::sqrtl(x); } -CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { ::memcpy(dest, src, size); } -CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { ::memset(dest, val, size); } - -#endif - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes") - -template <typename T, bool A> -struct struct_with_alignment -{ - T value; - KFR_INTRIN void operator=(T value) { this->value = value; } -}; - -template <typename T> -struct struct_with_alignment<T, false> -{ - T value; - KFR_INTRIN void operator=(T value) { this->value = value; } -} -#ifdef CMT_GNU_ATTRIBUTES -__attribute__((__packed__, __may_alias__)) // -#endif -; - -CMT_PRAGMA_GNU(GCC diagnostic pop) -} // namespace internal - -/// @brief Fills a value with zeros -template <typename T1> -CMT_INLINE void zeroize(T1& value) -{ - internal::builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1)); -} - -/// @brief Used to determine the initial value for reduce functions -template <typename T> -struct initialvalue -{ -}; - -namespace internal -{ -template <size_t width, typename Fn> -CMT_INLINE void block_process_impl(size_t& i, size_t size, Fn&& fn) -{ - CMT_LOOP_NOUNROLL - for (; i < size / width * width; i += width) - fn(i, csize_t<width>()); -} -} // namespace internal - -template <size_t... 
widths, typename Fn> -CMT_INLINE void block_process(size_t size, csizes_t<widths...>, Fn&& fn) -{ - size_t i = 0; - swallow{ (internal::block_process_impl<widths>(i, size, std::forward<Fn>(fn)), 0)... }; -} - -template <typename T> -struct is_simd_type - : std::integral_constant< - bool, std::is_same<T, float>::value || std::is_same<T, double>::value || - std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value || - std::is_same<T, short>::value || std::is_same<T, unsigned short>::value || - std::is_same<T, int>::value || std::is_same<T, unsigned int>::value || - std::is_same<T, long>::value || std::is_same<T, unsigned long>::value || - std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value> -{ -}; - -template <typename T, size_t N> -struct vec_t -{ - static_assert(N > 0 && N <= 1024, "Invalid vector size"); - - static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); - - using value_type = T; - constexpr static size_t size() noexcept { return N; } - constexpr vec_t() noexcept = default; - - using scalar_type = subtype<T>; - constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } -}; - -constexpr size_t index_undefined = static_cast<size_t>(-1); - -struct czeros_t -{ -}; -struct cones_t -{ -}; -constexpr czeros_t czeros{}; -constexpr cones_t cones{}; - -using caligned_t = cbool_t<true>; -using cunaligned_t = cbool_t<false>; - -constexpr caligned_t caligned{}; -constexpr cunaligned_t cunaligned{}; - -#ifdef CMT_INTRINSICS_IS_CONSTEXPR -#define KFR_I_CE constexpr -#else -#define KFR_I_CE -#endif -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp @@ -27,10 +27,10 @@ #include "../cometa/array.hpp" -#include "function.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/read_write.hpp" +#include "../simd/types.hpp" #include "memory.hpp" -#include 
"read_write.hpp" -#include "types.hpp" CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4324)) @@ -97,20 +97,14 @@ struct univector_base : input_expression, output_expression using output_expression::end_block; template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t index, const vec<U, N>& value) { T* data = derived_cast<Class>(this)->data(); write(ptr_cast<T>(data) + index, vec<T, N>(value)); } - template <typename U, size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const - { - const T* data = derived_cast<Class>(this)->data(); - return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index)); - } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> - CMT_INLINE Class& operator=(Input&& input) + KFR_MEM_INTRINSIC Class& operator=(Input&& input) { assign_expr(std::forward<Input>(input)); return *derived_cast<Class>(this); @@ -254,15 +248,15 @@ struct univector_base : input_expression, output_expression protected: template <typename Input> - CMT_INLINE void assign_expr(Input&& input) + KFR_MEM_INTRINSIC void assign_expr(Input&& input) { process(*derived_cast<Class>(this), std::forward<Input>(input)); } private: - CMT_INLINE size_t get_size() const { return derived_cast<Class>(this)->size(); } - CMT_INLINE const T* get_data() const { return derived_cast<Class>(this)->data(); } - CMT_INLINE T* get_data() { return derived_cast<Class>(this)->data(); } + KFR_MEM_INTRINSIC size_t get_size() const { return derived_cast<Class>(this)->size(); } + KFR_MEM_INTRINSIC const T* get_data() const { return derived_cast<Class>(this)->data(); } + KFR_MEM_INTRINSIC T* get_data() { return derived_cast<Class>(this)->data(); } static void copy(T* dest, const T* src, size_t size) { @@ -283,12 +277,12 @@ struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, S 
this->assign_expr(std::forward<Input>(input)); } template <typename... Args> - constexpr univector(const T& x, const Args&... args) noexcept + constexpr univector(const T& x, const Args&... args) CMT_NOEXCEPT : std::array<T, Size>{ { x, static_cast<T>(args)... } } { } - constexpr univector() noexcept(noexcept(std::array<T, Size>())) = default; + constexpr univector() CMT_NOEXCEPT_SPEC(noexcept(std::array<T, Size>())) = default; constexpr univector(size_t, const T& value) { std::fill(this->begin(), this->end(), value); } constexpr static bool size_known = true; constexpr static bool is_array = true; @@ -298,13 +292,13 @@ struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, S constexpr static bool is_pod = kfr::is_pod<T>::value; using value_type = T; - value_type get(size_t index, value_type fallback_value) const noexcept + value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT { return index < this->size() ? this->operator[](index) : fallback_value; } using univector_base<T, univector>::operator=; - void resize(size_t) noexcept {} + void resize(size_t) CMT_NOEXCEPT {} }; template <typename T> @@ -334,7 +328,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T constexpr univector(univector<U, Tag>& other) : array_ref<T>(other.data(), other.size()) { } - void resize(size_t) noexcept {} + void resize(size_t) CMT_NOEXCEPT {} constexpr static bool size_known = false; constexpr static bool is_array = false; constexpr static bool is_array_ref = true; @@ -342,7 +336,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T constexpr static bool is_aligned = false; using value_type = remove_const<T>; - value_type get(size_t index, value_type fallback_value) const noexcept + value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT { return index < this->size() ? 
this->operator[](index) : fallback_value; } @@ -364,9 +358,11 @@ struct univector<T, tag_dynamic_vector> : std::vector<T, allocator<T>>, this->resize(input.size()); this->assign_expr(std::forward<Input>(input)); } - constexpr univector() noexcept(noexcept(std::vector<T, allocator<T>>())) = default; + constexpr univector() CMT_NOEXCEPT_SPEC(noexcept(std::vector<T, allocator<T>>())) = default; constexpr univector(const std::vector<T, allocator<T>>& other) : std::vector<T, allocator<T>>(other) {} - constexpr univector(std::vector<T, allocator<T>>&& other) : std::vector<T, allocator<T>>(std::move(other)) {} + constexpr univector(std::vector<T, allocator<T>>&& other) : std::vector<T, allocator<T>>(std::move(other)) + { + } constexpr univector(const array_ref<T>& other) : std::vector<T, allocator<T>>(other.begin(), other.end()) { } @@ -378,19 +374,19 @@ struct univector<T, tag_dynamic_vector> : std::vector<T, allocator<T>>, constexpr univector(const std::vector<T, Allocator>&) = delete; template <typename Allocator> constexpr univector(std::vector<T, Allocator>&&) = delete; - constexpr static bool size_known = false; - constexpr static bool is_array = false; - constexpr static bool is_array_ref = false; - constexpr static bool is_vector = true; - constexpr static bool is_aligned = true; - using value_type = T; + constexpr static bool size_known = false; + constexpr static bool is_array = false; + constexpr static bool is_array_ref = false; + constexpr static bool is_vector = true; + constexpr static bool is_aligned = true; + using value_type = T; - value_type get(size_t index, value_type fallback_value) const noexcept + value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT { return index < this->size() ? 
this->operator[](index) : fallback_value; } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> - CMT_INLINE univector& operator=(Input&& input) + KFR_MEM_INTRINSIC univector& operator=(Input&& input) { if (input.size() != infinite_size) this->resize(input.size()); @@ -416,40 +412,18 @@ using univector3d = abstract_vector<abstract_vector<univector<T, Size3>, Size2>, /// @brief Creates univector from data and size template <typename T> -CMT_INLINE univector_ref<T> make_univector(T* data, size_t size) +KFR_INTRINSIC univector_ref<T> make_univector(T* data, size_t size) { return univector_ref<T>(data, size); } /// @brief Creates univector from data and size template <typename T> -CMT_INLINE univector_ref<const T> make_univector(const T* data, size_t size) +KFR_INTRINSIC univector_ref<const T> make_univector(const T* data, size_t size) { return univector_ref<const T>(data, size); } -/// @brief Converts an expression to univector -template <typename Expr, typename T = value_type_of<Expr>> -CMT_INLINE univector<T> render(Expr&& expr) -{ - static_assert(!is_infinite<Expr>::value, - "render: Can't process infinite expressions. 
Pass size as a second argument to render."); - univector<T> result; - result.resize(expr.size()); - result = expr; - return result; -} - -/// @brief Converts an expression to univector -template <typename Expr, typename T = value_type_of<Expr>> -CMT_INLINE univector<T> render(Expr&& expr, size_t size, size_t offset = 0) -{ - univector<T> result; - result.resize(size); - result = slice(expr, offset, size); - return result; -} - /// @brief Single producer single consumer lock-free ring buffer template <typename T> struct lockfree_ring_buffer @@ -476,8 +450,8 @@ struct lockfree_ring_buffer const size_t real_tail = cur_tail % buffer.size(); const size_t first_size = std::min(buffer.size() - real_tail, size); - internal::builtin_memcpy(buffer.data() + real_tail, source, first_size * sizeof(T)); - internal::builtin_memcpy(buffer.data(), source + first_size, (size - first_size) * sizeof(T)); + builtin_memcpy(buffer.data() + real_tail, source, first_size * sizeof(T)); + builtin_memcpy(buffer.data(), source + first_size, (size - first_size) * sizeof(T)); std::atomic_thread_fence(std::memory_order_release); @@ -500,8 +474,8 @@ struct lockfree_ring_buffer const size_t real_front = cur_front % buffer.size(); const size_t first_size = std::min(buffer.size() - real_front, size); - internal::builtin_memcpy(dest, buffer.data() + real_front, first_size * sizeof(T)); - internal::builtin_memcpy(dest + first_size, buffer.data(), (size - first_size) * sizeof(T)); + builtin_memcpy(dest, buffer.data() + real_front, first_size * sizeof(T)); + builtin_memcpy(dest + first_size, buffer.data(), (size - first_size) * sizeof(T)); std::atomic_thread_fence(std::memory_order_release); @@ -514,6 +488,47 @@ private: char cacheline_filler[64 - sizeof(std::atomic<size_t>)]; std::atomic<size_t> tail; }; +inline namespace CMT_ARCH_NAME +{ + +template <typename T, univector_tag Tag, typename U, size_t N> +KFR_INTRINSIC vec<U, N> get_elements(const univector<T, Tag>& self, cinput_t, size_t index, 
vec_shape<U, N>) +{ + const T* data = self.data(); + return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index)); +} + +/// @brief Converts an expression to univector +template <typename Expr, typename T = value_type_of<Expr>> +KFR_INTRINSIC univector<T> render(Expr&& expr) +{ + static_assert(!is_infinite<Expr>::value, + "render: Can't process infinite expressions. Pass size as a second argument to render."); + univector<T> result; + result.resize(expr.size()); + result = expr; + return result; +} + +/// @brief Converts an expression to univector +template <typename Expr, typename T = value_type_of<Expr>> +KFR_INTRINSIC univector<T> render(Expr&& expr, size_t size, size_t offset = 0) +{ + univector<T> result; + result.resize(size); + result = slice(expr, offset, size); + return result; +} + +/// @brief Converts an expression to univector +template <typename Expr, size_t Size, typename T = value_type_of<Expr>> +KFR_INTRINSIC univector<T, Size> render(Expr&& expr, csize_t<Size>) +{ + univector<T, Size> result; + result = expr; + return result; +} +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp @@ -1,1171 +0,0 @@ -/** @addtogroup types - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "kfr.h" - -#include "constants.hpp" -#include "platform.hpp" -#include "types.hpp" - -namespace kfr -{ - -template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> -CMT_INLINE vec<T, Nout> low(const vec<T, N>& x); -template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> -CMT_INLINE vec<T, Nout> high(const vec<T, N>& x); -} // namespace kfr - -#ifdef CMT_COMPILER_CLANG -#include "simd_clang.hpp" -#else -#include "simd_intrin.hpp" -#ifdef CMT_ARCH_X86 -#include "simd_x86.hpp" -#endif -#endif - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked") - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4814)) - -namespace kfr -{ - -template <typename T> -using maskfor = typename T::mask_t; - -template <typename T, size_t N> -struct mask : protected vec<T, N> -{ - using base = vec<T, N>; - KFR_I_CE mask() noexcept = default; - KFR_I_CE mask(const mask&) noexcept = default; - KFR_I_CE mask& operator=(const mask&) noexcept = default; - using simd_type = typename base::simd_type; - - simd_type operator*() const noexcept { return this->simd; } - simd_type& operator*() noexcept { return this->simd; } - - KFR_I_CE mask(const base& v) noexcept - //: base(base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec())) - { - this->simd = *base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()); - } - - KFR_I_CE mask(const 
simd_type& simd) : base(simd) {} - template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m.asvec())) - { - } - template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_I_CE mask(const vec<U, N>& m) : base(base::frombits(m)) - { - } - KFR_I_CE mask operator&(const mask& y) const noexcept - { - return static_cast<const base&>(*this) & static_cast<const base&>(y); - } - KFR_I_CE mask operator|(const mask& y) const noexcept - { - return static_cast<const base&>(*this) | static_cast<const base&>(y); - } - KFR_I_CE mask operator&&(const mask& y) const noexcept - { - return static_cast<const base&>(*this) & static_cast<const base&>(y); - } - KFR_I_CE mask operator||(const mask& y) const noexcept - { - return static_cast<const base&>(*this) | static_cast<const base&>(y); - } - KFR_I_CE mask operator^(const mask& y) const noexcept - { - return static_cast<const base&>(*this) ^ static_cast<const base&>(y); - } - KFR_I_CE mask operator~() const noexcept { return ~static_cast<const base&>(*this); } - - bool operator[](size_t index) const noexcept; - - constexpr base asvec() const noexcept { return reinterpret_cast<const base&>(*this); } -}; - -namespace internal -{ - -constexpr inline size_t scale_get_index(size_t counter, size_t groupsize, size_t index) -{ - return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index); -} - -template <size_t counter, size_t groupsize, size_t... indices> -constexpr inline size_t scale_get_index(csizes_t<indices...>) -{ - return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); -} - -template <size_t... indices, size_t... 
counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> -constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) noexcept - -> csizes_t<scale_get_index<counter, groupsize>(ind)...> -{ - return {}; -} -} // namespace internal - -template <size_t groupsize, size_t... indices> -constexpr inline auto scale() noexcept -{ - return internal::scale_impl(csizes_t<indices...>(), csizeseq_t<sizeof...(indices) * groupsize>()); -} - -template <typename T, size_t Nin, size_t N> -struct vec<vec<T, Nin>, N> : private vec<T, Nin * N> -{ - using base = vec<T, Nin * N>; - - using value_type = vec<T, Nin>; - constexpr static size_t size() noexcept { return N; } - - using scalar_type = T; - constexpr static size_t scalar_size() noexcept { return Nin * N; } - - using simd_type = typename base::simd_type; - - constexpr vec() noexcept = default; - constexpr vec(const vec&) noexcept = default; - CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default; - constexpr vec(const simd_type& simd) noexcept : base(simd) {} - constexpr vec(czeros_t) noexcept : base(czeros) {} - constexpr vec(cones_t) noexcept : base(cones) {} - - constexpr vec(const value_type& v) noexcept : base(v.shuffle(csizeseq_t<Nin * N>() % csize_t<Nin>())) {} - - template <int = 0> - explicit constexpr vec(const vec<T, Nin * N>& v) noexcept : base(v) - { - } - - // from list of vectors - template <typename... Us> - constexpr vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept - : base(s0, s1, rest...) 
- { - } - - template <typename U> - constexpr vec(const vec<vec<U, Nin>, N>& v) noexcept : base(static_cast<vec<T, Nin * N>>(v.flatten())) - { - } - - template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> - constexpr static vec frombits(const vec<U, M>& v) noexcept - { - return vec(base::frombits(v.flatten())); - } - - // math / bitwise / comparison operators - constexpr friend vec operator+(const vec& x) noexcept { return x; } - constexpr friend vec operator-(const vec& x) noexcept { return base::operator-(x); } - constexpr friend vec operator~(const vec& x) noexcept { return base::operator~(x); } - -#define KFR_B(x) static_cast<const base&>(x) - - constexpr friend vec operator+(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) + KFR_B(y)); } - constexpr friend vec operator-(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) - KFR_B(y)); } - constexpr friend vec operator*(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) * KFR_B(y)); } - constexpr friend vec operator/(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) / KFR_B(y)); } - - constexpr friend vec operator<<(const vec& x, int shift) noexcept { return vec(KFR_B(x) << shift); } - constexpr friend vec operator>>(const vec& x, int shift) noexcept { return vec(KFR_B(x) >> shift); } - constexpr friend vec operator&(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) & KFR_B(y)); } - constexpr friend vec operator|(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) | KFR_B(y)); } - constexpr friend vec operator^(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) ^ KFR_B(y)); } - -#undef KFR_B - - constexpr friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; } - constexpr friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; } - constexpr friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; } - constexpr friend vec& operator/=(vec& x, const vec& y) noexcept { return x = 
x / y; } - - constexpr friend vec& operator<<=(vec& x, int shift) noexcept { return x = x << shift; } - constexpr friend vec& operator>>=(vec& x, int shift) noexcept { return x = x >> shift; } - constexpr friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; } - constexpr friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; } - constexpr friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; } - - constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } - constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - constexpr friend vec operator++(vec& x, int) noexcept - { - const vec z = x; - ++x; - return z; - } - constexpr friend vec operator--(vec& x, int) noexcept - { - const vec z = x; - --x; - return z; - } - - // shuffle - template <size_t... indices> - constexpr vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept - { - return *base::shuffle(scale<Nin, indices...>()); - } - template <size_t... 
indices> - constexpr vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept - { - return *base::shuffle(y, scale<Nin, indices...>()); - } - - // element access - struct element; - CMT_GNU_CONSTEXPR value_type operator[](size_t index) const noexcept { return get(index); } - CMT_GNU_CONSTEXPR element operator[](size_t index) noexcept { return { *this, index }; } - - CMT_GNU_CONSTEXPR value_type get(size_t index) const noexcept - { - return reinterpret_cast<const value_type(&)[N]>(*this)[index]; - } - CMT_GNU_CONSTEXPR void set(size_t index, const value_type& s) noexcept - { - reinterpret_cast<value_type(&)[N]>(*this)[index] = s; - } - template <size_t index> - CMT_GNU_CONSTEXPR value_type get(csize_t<index>) const noexcept - { - return static_cast<const base&>(*this).shuffle(csizeseq_t<Nin, index * Nin>()); - } - template <size_t index> - CMT_GNU_CONSTEXPR void set(csize_t<index>, const value_type& s) noexcept - { - *this = vec(static_cast<const base&>(*this)) - .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * Nin>() && - csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) * - N); - } - struct element - { - constexpr operator value_type() const noexcept { return v.get(index); } - element& operator=(const value_type& s) noexcept - { - v.set(index, s); - return *this; - } - vec& v; - size_t index; - }; - - template <bool aligned = false> - explicit constexpr vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept - : base(ptr_cast<T>(src), cbool_t<aligned>()) - { - } - template <bool aligned = false> - const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept - { - base::write(ptr_cast<T>(dest), cbool_t<aligned>()); - return *this; - } - - const base& flatten() const noexcept { return *this; } - simd_type operator*() const noexcept { return base::operator*(); } - simd_type& operator*() noexcept { return base::operator*(); } -}; - -namespace internal -{ - -template 
<typename T> -constexpr inline T maskbits(bool value) -{ - return value ? constants<T>::allones() : T(); -} - -template <typename T, size_t N> -struct flt_type_impl<vec<T, N>> -{ - using type = vec<typename flt_type_impl<T>::type, N>; -}; - -template <typename T> -struct is_vec_impl : std::false_type -{ -}; - -template <typename T, size_t N> -struct is_vec_impl<vec<T, N>> : std::true_type -{ -}; -} // namespace internal - -template <typename T> -using is_vec = internal::is_vec_impl<T>; - -template <typename To, typename From, size_t N, - KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value), - size_t Nout = N* compound_type_traits<From>::width / compound_type_traits<To>::width> -constexpr CMT_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>(value.flatten()); -} - -#ifdef KFR_ENABLE_SWIZZLE -namespace swizzle -{ -template <size_t> -struct swiz -{ - constexpr swiz() {} -}; - -constexpr swiz<0> x{}; -constexpr swiz<1> y{}; -constexpr swiz<2> z{}; -constexpr swiz<3> w{}; -constexpr swiz<0> r{}; -constexpr swiz<1> g{}; -constexpr swiz<2> b{}; -constexpr swiz<3> a{}; -constexpr swiz<0> s{}; -constexpr swiz<1> t{}; -constexpr swiz<2> p{}; -constexpr swiz<3> q{}; - -constexpr swiz<0> s0{}; -constexpr swiz<1> s1{}; -constexpr swiz<2> s2{}; -constexpr swiz<3> s3{}; -constexpr swiz<4> s4{}; -constexpr swiz<5> s5{}; -constexpr swiz<6> s6{}; -constexpr swiz<7> s7{}; -constexpr swiz<8> s8{}; -constexpr swiz<9> s9{}; -constexpr swiz<10> s10{}; -constexpr swiz<11> s11{}; -constexpr swiz<12> s12{}; -constexpr swiz<13> s13{}; -constexpr swiz<14> s14{}; -constexpr swiz<15> s15{}; -} // namespace swizzle -#endif - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast") - -template <size_t N, typename T> -constexpr CMT_INLINE vec<T, N> broadcast(T x) -{ - return x; -} - -CMT_PRAGMA_GNU(GCC diagnostic pop) - -namespace internal -{ - -template <typename To, typename From, size_t N, typename Tsub = 
deep_subtype<To>, - size_t Nout = N* compound_type_traits<To>::deep_width> -constexpr CMT_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept -{ - return vec<To, N>(value); -} - -// scalar to scalar -template <typename To, typename From> -struct conversion -{ - static_assert(std::is_convertible<From, To>::value, ""); - static To cast(const From& value) { return value; } -}; - -// vector to vector -template <typename To, typename From, size_t N> -struct conversion<vec<To, N>, vec<From, N>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<To, N> cast(const vec<From, N>& value) { return builtin_convertvector<To>(value); } -}; - -// vector<vector> to vector<vector> -template <typename To, typename From, size_t N1, size_t N2> -struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, N1>, N2>> -{ - static_assert(!is_compound<To>::value, ""); - static_assert(!is_compound<From>::value, ""); - static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value) - { - return builtin_convertvector<vec<To, N1>>(value); - } -}; - -// scalar to vector -template <typename To, typename From, size_t N> -struct conversion<vec<To, N>, From> -{ - static_assert(std::is_convertible<From, To>::value, ""); - static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); } -}; -} // namespace internal - -template <typename T> -constexpr size_t size_of() noexcept -{ - return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width; -} - -template <typename From, size_t N, typename Tsub = deep_subtype<From>, - size_t Nout = N* size_of<From>() / size_of<Tsub>()> -constexpr CMT_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& x) noexcept -{ - return x.flatten(); -} - -template <typename To, typename From, - typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> -constexpr CMT_INLINE Tout cast(const From& value) noexcept -{ - return 
static_cast<Tout>(value); -} - -template <typename To, typename From> -CMT_GNU_CONSTEXPR CMT_INLINE To bitcast(const From& value) noexcept -{ - static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types"); - union { - From from; - To to; - } u{ value }; - return u.to; -} - -template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> -CMT_GNU_CONSTEXPR CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr CMT_INLINE To ubitcast(const From& value) noexcept -{ - return bitcast<To>(value); -} - -template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr CMT_INLINE To ibitcast(const From& value) noexcept -{ - return bitcast<To>(value); -} - -template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> -constexpr CMT_INLINE To fbitcast(const From& value) noexcept -{ - return bitcast<To>(value); -} - -template <typename From, size_t N, typename To = utype<From>, - size_t Nout = size_of<From>() * N / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename From, size_t N, typename To = itype<From>, - size_t Nout = size_of<From>() * N / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename From, size_t N, typename To = ftype<From>, - size_t Nout = size_of<From>() * N / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept -{ - return vec<To, Nout>::frombits(value); -} - -template <typename T, size_t N> -inline bool mask<T, N>::operator[](size_t index) const noexcept -{ - return ibitcast(base::operator[](index)) < 0; -} - 
-constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); } - -namespace internal -{ -template <size_t start = 0, size_t stride = 1> -struct shuffle_index -{ - constexpr CMT_INLINE size_t operator()(size_t index) const { return start + index * stride; } -}; - -template <size_t count, size_t start = 0, size_t stride = 1> -struct shuffle_index_wrap -{ - constexpr inline size_t operator()(size_t index) const { return (start + index * stride) % count; } -}; -} // namespace internal - -template <size_t count, typename T, size_t N, size_t Nout = N* count> -CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>() % csize_t<N>()); -} -KFR_FN(repeat) - -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)> -CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>() % csize_t<N>()); -} -template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)> -constexpr CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) -{ - return x; -} -KFR_FN(resize) - -template <typename T, size_t N> -struct pkd_vec -{ - constexpr pkd_vec() noexcept {} - pkd_vec(const vec<T, N>& value) noexcept { value.write(v); } - template <typename... Ts> - constexpr pkd_vec(Ts... init) noexcept : v{ static_cast<T>(init)... } - { - static_assert(N <= sizeof...(Ts), "Too few initializers for pkd_vec"); - } - -private: - T v[N]; - friend struct vec<T, N>; -} -#ifdef CMT_GNU_ATTRIBUTES -__attribute__((packed)) -#endif -; - -namespace internal -{ - -template <size_t, typename T> -constexpr CMT_INLINE T make_vector_get_n() -{ - return T(); -} -template <size_t index, typename T, typename... Args> -constexpr CMT_INLINE T make_vector_get_n(const T& arg, const Args&... args) -{ - return index == 0 ? arg : make_vector_get_n<index - 1, T>(args...); -} - -template <typename T, typename... Args, size_t... 
indices, size_t N = sizeof...(Args)> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, const Args&... args) -{ - const T list[] = { static_cast<T>(args)... }; - return vec<T, N>(list[indices]...); -} -} // namespace internal - -/// Create vector from scalar values -/// @code -/// CHECK( make_vector( 1, 2, 3, 4 ) == i32x4{1, 2, 3, 4} ); -/// @endcode -template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), - typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>> -constexpr CMT_INLINE vec<SubType, N> make_vector(const Arg& x, const Args&... rest) -{ - return internal::make_vector_impl<SubType>(cvalseq_t<size_t, N>(), static_cast<SubType>(x), - static_cast<SubType>(rest)...); -} -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> make_vector(const vec<T, N>& x) -{ - return x; -} -template <typename T, T... Values, size_t N = sizeof...(Values)> -constexpr CMT_INLINE vec<T, N> make_vector(cvals_t<T, Values...>) -{ - return make_vector<T>(Values...); -} -KFR_FN(make_vector) - -template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), - typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>, - KFR_ENABLE_IF(is_number<subtype<SubType>>::value)> -constexpr CMT_INLINE vec<SubType, N> pack(const Arg& x, const Args&... 
rest) -{ - return internal::make_vector_impl<SubType>(csizeseq_t<N * widthof<SubType>()>(), static_cast<SubType>(x), - static_cast<SubType>(rest)...); -} -KFR_FN(pack) - -namespace operators -{ -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator+(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator-(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator*(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator/(const vec<T1, N>& x, const T2& y) -{ - return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y); -} - -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator+(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator-(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator*(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator/(const T1& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) / 
static_cast<vec<C, N>>(y); -} - -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator+(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator-(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator*(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y); -} -template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> -constexpr CMT_INLINE vec<C, N> operator/(const vec<T1, N>& x, const vec<T2, N>& y) -{ - return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y); -} - -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator&&(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) && y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator||(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) || y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator&(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) & y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator|(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) | y; -} -template <typename T1, size_t N> -constexpr CMT_INLINE vec<T1, N> operator^(const T1& x, const vec<T1, N>& y) -{ - return static_cast<vec<T1, N>>(x) ^ y; -} -} // namespace operators - -using namespace operators; - -template <typename T, size_t N1, size_t N2 = N1> -using mat = vec<vec<T, N1>, N2>; - -namespace internal -{ - -template <size_t start, size_t count> -struct 
shuffle_index_extend -{ - constexpr CMT_INLINE size_t operator()(size_t index) const - { - return index >= start && index < start + count ? index - start : index_undefined; - } -}; - -template <typename T, size_t Nout, size_t N1, size_t... indices> -constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>) -{ - return make_vector(maskbits<T>(indices < N1)...); -} -template <typename T, size_t Nout, size_t N1> -constexpr vec<T, Nout> partial_mask() -{ - return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>()); -} -} // namespace internal - -template <typename T> -using optvec = vec<T, platform<T>::vector_capacity / 4>; - -using f32x1 = vec<f32, 1>; -using f32x2 = vec<f32, 2>; -using f32x3 = vec<f32, 3>; -using f32x4 = vec<f32, 4>; -using f32x8 = vec<f32, 8>; -using f32x16 = vec<f32, 16>; -using f32x32 = vec<f32, 32>; -using f32x64 = vec<f32, 64>; -using f64x1 = vec<f64, 1>; -using f64x2 = vec<f64, 2>; -using f64x3 = vec<f64, 3>; -using f64x4 = vec<f64, 4>; -using f64x8 = vec<f64, 8>; -using f64x16 = vec<f64, 16>; -using f64x32 = vec<f64, 32>; -using f64x64 = vec<f64, 64>; -using i8x1 = vec<i8, 1>; -using i8x2 = vec<i8, 2>; -using i8x3 = vec<i8, 3>; -using i8x4 = vec<i8, 4>; -using i8x8 = vec<i8, 8>; -using i8x16 = vec<i8, 16>; -using i8x32 = vec<i8, 32>; -using i8x64 = vec<i8, 64>; -using i16x1 = vec<i16, 1>; -using i16x2 = vec<i16, 2>; -using i16x3 = vec<i16, 3>; -using i16x4 = vec<i16, 4>; -using i16x8 = vec<i16, 8>; -using i16x16 = vec<i16, 16>; -using i16x32 = vec<i16, 32>; -using i16x64 = vec<i16, 64>; -using i32x1 = vec<i32, 1>; -using i32x2 = vec<i32, 2>; -using i32x3 = vec<i32, 3>; -using i32x4 = vec<i32, 4>; -using i32x8 = vec<i32, 8>; -using i32x16 = vec<i32, 16>; -using i32x32 = vec<i32, 32>; -using i32x64 = vec<i32, 64>; -using i64x1 = vec<i64, 1>; -using i64x2 = vec<i64, 2>; -using i64x3 = vec<i64, 3>; -using i64x4 = vec<i64, 4>; -using i64x8 = vec<i64, 8>; -using i64x16 = vec<i64, 16>; -using i64x32 = vec<i64, 32>; -using i64x64 = 
vec<i64, 64>; -using u8x1 = vec<u8, 1>; -using u8x2 = vec<u8, 2>; -using u8x3 = vec<u8, 3>; -using u8x4 = vec<u8, 4>; -using u8x8 = vec<u8, 8>; -using u8x16 = vec<u8, 16>; -using u8x32 = vec<u8, 32>; -using u8x64 = vec<u8, 64>; -using u16x1 = vec<u16, 1>; -using u16x2 = vec<u16, 2>; -using u16x3 = vec<u16, 3>; -using u16x4 = vec<u16, 4>; -using u16x8 = vec<u16, 8>; -using u16x16 = vec<u16, 16>; -using u16x32 = vec<u16, 32>; -using u16x64 = vec<u16, 64>; -using u32x1 = vec<u32, 1>; -using u32x2 = vec<u32, 2>; -using u32x3 = vec<u32, 3>; -using u32x4 = vec<u32, 4>; -using u32x8 = vec<u32, 8>; -using u32x16 = vec<u32, 16>; -using u32x32 = vec<u32, 32>; -using u32x64 = vec<u32, 64>; -using u64x1 = vec<u64, 1>; -using u64x2 = vec<u64, 2>; -using u64x3 = vec<u64, 3>; -using u64x4 = vec<u64, 4>; -using u64x8 = vec<u64, 8>; -using u64x16 = vec<u64, 16>; -using u64x32 = vec<u64, 32>; -using u64x64 = vec<u64, 64>; - -using u8x2x2 = vec<vec<u8, 2>, 2>; -using i8x2x2 = vec<vec<i8, 2>, 2>; -using u16x2x2 = vec<vec<u16, 2>, 2>; -using i16x2x2 = vec<vec<i16, 2>, 2>; -using u32x2x2 = vec<vec<u32, 2>, 2>; -using i32x2x2 = vec<vec<i32, 2>, 2>; -using u64x2x2 = vec<vec<u64, 2>, 2>; -using i64x2x2 = vec<vec<i64, 2>, 2>; -using f32x2x2 = vec<vec<f32, 2>, 2>; -using f64x2x2 = vec<vec<f64, 2>, 2>; - -using u8x4x4 = vec<vec<u8, 4>, 4>; -using i8x4x4 = vec<vec<i8, 4>, 4>; -using u16x4x4 = vec<vec<u16, 4>, 4>; -using i16x4x4 = vec<vec<i16, 4>, 4>; -using u32x4x4 = vec<vec<u32, 4>, 4>; -using i32x4x4 = vec<vec<i32, 4>, 4>; -using u64x4x4 = vec<vec<u64, 4>, 4>; -using i64x4x4 = vec<vec<i64, 4>, 4>; -using f32x4x4 = vec<vec<f32, 4>, 4>; -using f64x4x4 = vec<vec<f64, 4>, 4>; - -namespace glsl_names -{ -using vec2 = f32x2; -using vec3 = f32x3; -using vec4 = f32x4; -using dvec2 = f64x2; -using dvec3 = f64x3; -using dvec4 = f64x4; -using ivec2 = i32x2; -using ivec3 = i32x3; -using ivec4 = i32x4; -using uvec2 = u32x2; -using uvec3 = u32x3; -using uvec4 = u32x4; -} // namespace glsl_names -namespace 
opencl_names -{ -using char2 = i8x2; -using char3 = i8x3; -using char4 = i8x4; -using char8 = i8x8; -using char16 = i8x16; -using uchar2 = u8x2; -using uchar3 = u8x3; -using uchar4 = u8x4; -using uchar8 = u8x8; -using uchar16 = u8x16; - -using short2 = i16x2; -using short3 = i16x3; -using short4 = i16x4; -using short8 = i16x8; -using short16 = i16x16; -using ushort2 = u16x2; -using ushort3 = u16x3; -using ushort4 = u16x4; -using ushort8 = u16x8; -using ushort16 = u16x16; - -using int2 = i32x2; -using int3 = i32x3; -using int4 = i32x4; -using int8 = i32x8; -using int16 = i32x16; -using uint2 = u32x2; -using uint3 = u32x3; -using uint4 = u32x4; -using uint8 = u32x8; -using uint16 = u32x16; - -using long2 = i64x2; -using long3 = i64x3; -using long4 = i64x4; -using long8 = i64x8; -using long16 = i64x16; -using ulong2 = u64x2; -using ulong3 = u64x3; -using ulong4 = u64x4; -using ulong8 = u64x8; -using ulong16 = u64x16; - -using float2 = f32x2; -using float3 = f32x3; -using float4 = f32x4; -using float8 = f32x8; -using float16 = f32x16; - -using double2 = f64x2; -using double3 = f64x3; -using double4 = f64x4; -using double8 = f64x8; -using double16 = f64x16; -} // namespace opencl_names - -namespace internal -{ - -template <typename T, size_t N> -struct vec_type -{ - using type = vec<T, N>; -}; - -template <typename T, size_t Nmax> -struct maxvec -{ - constexpr static size_t size = Nmax; - vec<T, size> vmax; - maxvec(T initial) : vmax(initial) {} - template <int N> - vec<T, N>& v() - { - static_assert(N <= size, "N <= size"); - return reinterpret_cast<vec<T, N>&>(*this); - } - template <int N> - const vec<T, N>& v() const - { - static_assert(N <= size, "N <= size"); - return reinterpret_cast<const vec<T, N>&>(*this); - } -}; - -template <size_t Index, typename T, size_t N, typename Fn, typename... Args, - typename Tout = result_of<Fn(subtype<decay<Args>>...)>> -constexpr CMT_INLINE Tout applyfn_helper(Fn&& fn, Args&&... 
args) -{ - return fn(args[Index]...); -} - -template <typename T, size_t N, typename Fn, typename... Args, - typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices> -constexpr CMT_INLINE vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args) -{ - return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...); -} -template <typename T, size_t N, typename Fn, size_t... Indices> -constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) -{ - return make_vector(((void)Indices, void(), fn())...); -} -} // namespace internal - -template <typename T, size_t N, typename Fn, typename... Args, - typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> -constexpr CMT_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args) -{ - return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>(), arg, - std::forward<Args>(args)...); -} - -template <size_t N, typename Fn, typename T = result_of<Fn()>> -constexpr CMT_INLINE vec<T, N> apply(Fn&& fn) -{ - return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>()); -} - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_SIMD -CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); } -CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); } -#endif - -template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)> -constexpr CMT_INLINE mask<T, Nout> make_mask(bool arg, Args... 
args) -{ - return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...); -} -KFR_FN(make_mask) - -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector() -{ - return vec<T, N>(czeros); -} - -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector(vec_t<T, N>) -{ - return vec<T, N>(czeros); -} -KFR_FN(zerovector) - -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector() -{ - return vec<T, N>(cones); -} -template <typename T, size_t N> -CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>) -{ - return vec<T, N>(cones); -} -KFR_FN(allonesvector) - -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> undefinedvector() -{ - return vec<T, N>{}; -} -template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> undefinedvector(vec_t<T, N>) -{ - return undefinedvector<T, N>(); -} -KFR_FN(undefinedvector) - -template <typename T, size_t N, size_t Nout /*= prev_poweroftwo(N - 1)*/> -CMT_INLINE vec<T, Nout> low(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout>()); -} - -template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> -CMT_INLINE vec_t<T, Nout> low(vec_t<T, N>) -{ - return {}; -} - -template <typename T, size_t N, size_t Nout /*= N - prev_poweroftwo(N - 1)*/> -CMT_INLINE vec<T, Nout> high(const vec<T, N>& x) -{ - return x.shuffle(csizeseq_t<Nout, prev_poweroftwo(N - 1)>()); -} - -template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> -CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>) -{ - return {}; -} -KFR_FN(low) -KFR_FN(high) -} // namespace kfr - -namespace cometa -{ - -template <typename T, size_t N> -struct compound_type_traits<kfr::vec_t<T, N>> -{ - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static bool is_scalar = false; - 
constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - - template <typename U> - using rebind = kfr::vec_t<U, N>; - template <typename U> - using deep_rebind = kfr::vec_t<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; -}; - -template <typename T, size_t N> -struct compound_type_traits<kfr::vec<T, N>> -{ - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::vec<U, N>; - template <typename U> - using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; - - CMT_INLINE static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) { return value[index]; } -}; - -template <typename T, size_t N> -struct compound_type_traits<kfr::mask<T, N>> -{ - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::mask<U, N>; - template <typename U> - using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; - - CMT_INLINE static constexpr subtype at(const kfr::mask<T, N>& value, size_t index) - { - return value[index]; - } -}; -} // namespace cometa - -namespace std -{ -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<T1, N>, kfr::vec<T2, N>> -{ - using type = kfr::vec<typename common_type<T1, T2>::type, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<kfr::vec<T1, N>, T2> -{ - using type = kfr::vec<typename 
common_type<T1, T2>::type, N>; -}; -template <typename T1, typename T2, size_t N> -struct common_type<T1, kfr::vec<T2, N>> -{ - using type = kfr::vec<typename common_type<T1, T2>::type, N>; -}; -template <typename T1, typename T2, size_t N1, size_t N2> -struct common_type<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>> -{ - using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>; -}; -template <typename T1, typename T2, size_t N1, size_t N2> -struct common_type<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>> -{ - using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>; -}; - -template <typename T1, typename T2, size_t N> -struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> -{ - using type = kfr::mask<typename common_type<T1, T2>::type, N>; -}; -} // namespace std - -CMT_PRAGMA_GNU(GCC diagnostic pop) -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -16,8 +16,10 @@ extern char* gets(char* __s); #ifdef CMT_ARCH_X86 #if defined(_M_X64) || defined(__x86_64__) #define CMT_ARCH_X64 1 +#define CMT_ARCH_BITNESS_NAME "64-bit" #else #define CMT_ARCH_X32 1 +#define CMT_ARCH_BITNESS_NAME "32-bit" #endif #ifndef CMT_FORCE_GENERIC_CPU @@ -133,8 +135,10 @@ extern char* gets(char* __s); #if defined(__aarch64__) #define CMT_ARCH_X64 1 +#define CMT_ARCH_BITNESS_NAME "64-bit" #else #define CMT_ARCH_X32 1 +#define CMT_ARCH_BITNESS_NAME "32-bit" #endif #ifdef __ARM_NEON__ @@ -146,22 +150,22 @@ extern char* gets(char* __s); #else #define CMT_ARCH_NEON 1 #define CMT_ARCH_NAME neon -#define KFR_NO_NATIVE_F64 1 +#define CMT_NO_NATIVE_F64 1 #endif #endif #endif #ifndef CMT_ARCH_NAME -#define CMT_ARCH_NAME common +#define CMT_ARCH_NAME generic #endif -#ifndef KFR_NO_NATIVE_F64 -#define KFR_NATIVE_F64 1 +#ifndef CMT_NO_NATIVE_F64 +#define CMT_NATIVE_F64 1 #endif -#ifndef KFR_NO_NATIVE_I64 -#define KFR_NATIVE_I64 1 +#ifndef CMT_NO_NATIVE_I64 +#define CMT_NATIVE_I64 1 #endif #define CMT_STRINGIFY2(x) #x @@ 
-250,28 +254,29 @@ extern char* gets(char* __s); #define CMT_ALWAYS_INLINE #endif #define CMT_INLINE __inline__ CMT_ALWAYS_INLINE -#define CMT_INTRIN CMT_INLINE CMT_NODEBUG #define CMT_INLINE_MEMBER CMT_ALWAYS_INLINE #define CMT_INLINE_LAMBDA CMT_INLINE_MEMBER #define CMT_NOINLINE __attribute__((__noinline__)) #define CMT_FLATTEN __attribute__((__flatten__)) #define CMT_RESTRICT __restrict__ -#define CMT_FUNC __inline__ #elif defined(CMT_MSVC_ATTRIBUTES) +#define CMT_ALWAYS_INLINE __forceinline #define CMT_NODEBUG #define CMT_INLINE /*inline*/ __forceinline -#define CMT_INTRIN CMT_INLINE CMT_NODEBUG #define CMT_INLINE_MEMBER __forceinline #define CMT_INLINE_LAMBDA #define CMT_NOINLINE __declspec(noinline) #define CMT_FLATTEN #define CMT_RESTRICT __restrict -#define CMT_FUNC inline #endif +#define CMT_INTRINSIC CMT_INLINE CMT_NODEBUG +#define CMT_MEM_INTRINSIC CMT_INLINE CMT_NODEBUG +#define CMT_FUNCTION inline + #if defined _MSC_VER && _MSC_VER >= 1900 && \ (!defined(__clang__) || \ (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9)))) @@ -386,8 +391,10 @@ extern char* gets(char* __s); #if CMT_HAS_NOEXCEPT #define CMT_NOEXCEPT noexcept +#define CMT_NOEXCEPT_SPEC(...) noexcept(__VA_ARGS__) #else #define CMT_NOEXCEPT +#define CMT_NOEXCEPT_SPEC(...) #endif #if CMT_COMPILER_GNU && !defined(__EXCEPTIONS) @@ -491,16 +498,55 @@ extern char* gets(char* __s); #define CMT_OS_NAME "unknown" #endif -#if defined CMT_COMPILER_CLANG +#if defined CMT_COMPILER_INTEL #if defined _MSC_VER -#define CMT_COMPIER_NAME "clang-msvc" +#define CMT_COMPILER_NAME "intel-msvc" +#define CMT_COMPILER_FULL_NAME \ + "clang-msvc-" CMT_STRINGIFY(__ICL) "." CMT_STRINGIFY(__INTEL_COMPILER_UPDATE) "." 
CMT_STRINGIFY( \ + __INTEL_COMPILER_BUILD_DATE) +#else +#define CMT_COMPILER_NAME "intel" +#ifdef __INTEL_CLANG_COMPILER +#define CMT_COMPILER_INTEL_SPEC "-clang" +#ifdef __INTEL_LLVM_COMPILER +#define CMT_COMPILER_INTEL_SPEC "-clang-llvm" +#endif #else -#define CMT_COMPIER_NAME "clang" +#ifdef __INTEL_LLVM_COMPILER +#define CMT_COMPILER_INTEL_SPEC "-llvm" +#else +#define CMT_COMPILER_INTEL_SPEC "" +#endif +#endif +#define CMT_COMPILER_FULL_NAME \ + "intel-" CMT_STRINGIFY(__INTEL_COMPILER) CMT_COMPILER_INTEL_SPEC \ + "." CMT_STRINGIFY(__INTEL_COMPILER_UPDATE) "." CMT_STRINGIFY(__INTEL_COMPILER_BUILD_DATE) +#endif +#elif defined CMT_COMPILER_CLANG +#if defined _MSC_VER +#define CMT_COMPILER_NAME "clang-msvc" +#define CMT_COMPILER_FULL_NAME \ + "clang-msvc-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \ + __clang_patchlevel__) +#else +#define CMT_COMPILER_NAME "clang" +#define CMT_COMPILER_FULL_NAME \ + "clang-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \ + __clang_patchlevel__) #endif #elif defined CMT_COMPILER_GCC -#define CMT_COMPIER_NAME "gcc" +#define CMT_COMPILER_NAME "gcc" +#define CMT_COMPILER_FULL_NAME \ + "gcc-" CMT_STRINGIFY(__GNUC__) "." CMT_STRINGIFY(__GNUC_MINOR__) "." CMT_STRINGIFY(__GNUC_PATCHLEVEL__) #elif defined CMT_COMPILER_MSVC -#define CMT_COMPIER_NAME "msvc" +#define CMT_COMPILER_NAME "msvc" +#define CMT_COMPILER_FULL_NAME "msvc-" CMT_STRINGIFY(_MSC_VER) "." CMT_STRINGIFY(_MSC_FULL_VER) #else -#define CMT_COMPIER_NAME "unknown" +#define CMT_COMPILER_NAME "unknown" +#define CMT_COMPILER_FULL_NAME "unknown" #endif + +#define CMT_CONCAT(a, b) a##b + +#define CMT_NARGS2(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, ...) _10 +#define CMT_NARGS(...) 
CMT_NARGS2(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -8,11 +8,15 @@ #include <cstdint> #include <cstdlib> #include <limits> +#include <random> #include <type_traits> #include <utility> CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunknown-warning-option") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4814)) @@ -26,13 +30,13 @@ using std::size_t; #if __cplusplus >= 201103L || CMT_MSC_VER >= 1900 || CMT_HAS_FEATURE(cxx_constexpr) template <typename T, size_t N> -constexpr inline static size_t arraysize(const T (&)[N]) noexcept +constexpr inline static size_t arraysize(const T (&)[N]) CMT_NOEXCEPT { return N; } template <typename T, size_t N> -constexpr inline static std::integral_constant<size_t, N> carraysize(const T (&)[N]) noexcept +constexpr inline static std::integral_constant<size_t, N> carraysize(const T (&)[N]) CMT_NOEXCEPT { return {}; } @@ -173,9 +177,6 @@ using is_template_arg = std::integral_constant<bool, std::is_integral<T>::value template <typename T> using decay = typename std::decay<T>::type; -template <typename... T> -using decay_common = decay<common_type<T...>>; - template <typename T1, typename T2 = void, typename... 
Ts> constexpr size_t typeindex() { @@ -253,7 +254,7 @@ namespace ops { struct empty { - constexpr empty() noexcept {} + constexpr empty() CMT_NOEXCEPT {} }; } // namespace ops @@ -261,9 +262,9 @@ template <typename T, T val> struct cval_t : ops::empty { constexpr static T value = val; - constexpr cval_t() noexcept {} - constexpr cval_t(const cval_t&) noexcept = default; - constexpr cval_t(cval_t&&) noexcept = default; + constexpr cval_t() CMT_NOEXCEPT {} + constexpr cval_t(const cval_t&) CMT_NOEXCEPT = default; + constexpr cval_t(cval_t&&) CMT_NOEXCEPT = default; typedef T value_type; typedef cval_t type; constexpr operator value_type() const { return value; } @@ -386,6 +387,8 @@ struct get_nth_type<index> template <typename T, T... values> struct cvals_t : ops::empty { + constexpr cvals_t() CMT_NOEXCEPT = default; + using type = cvals_t<T, values...>; constexpr static size_t size() { return sizeof...(values); } template <size_t index> @@ -413,12 +416,13 @@ struct cvals_t : ops::empty constexpr cvals_t<T, details::get_nth_e<indices, type>::value...> operator[]( cvals_t<size_t, indices...>) const { + // static_assert(sizeof(T)==0, "+++++++++++++++++++++++++++++"); return {}; } // MSVC requires static_cast<T> here: template <typename Fn> - constexpr auto map(Fn&& fn) -> cvals_t<T, static_cast<T>(Fn()(values))...> + constexpr auto map(Fn&&) const -> cvals_t<T, static_cast<T>(Fn()(values))...> { return {}; } @@ -487,6 +491,10 @@ constexpr inline T cprod(cvals_t<T, first, rest...>) template <typename T> struct ctype_t { +#ifdef CMT_COMPILER_INTEL + constexpr ctype_t() CMT_NOEXCEPT = default; + constexpr ctype_t(const ctype_t&) CMT_NOEXCEPT = default; +#endif using type = T; }; @@ -510,9 +518,15 @@ struct ctypes_t namespace details { -template <typename T1, typename T2> +template <typename T1, typename... Ts> struct concat_impl; +template <typename T> +struct concat_impl<T> +{ + using type = T; +}; + template <typename T, T... values1, T... 
values2> struct concat_impl<cvals_t<T, values1...>, cvals_t<T, values2...>> { @@ -523,12 +537,19 @@ struct concat_impl<ctypes_t<types1...>, ctypes_t<types2...>> { using type = ctypes_t<types1..., types2...>; }; + +template <typename T1, typename T2, typename T3, typename... Ts> +struct concat_impl<T1, T2, T3, Ts...> +{ + using type = typename concat_impl<typename concat_impl<T1, T2>::type, T3, Ts...>::type; +}; + } // namespace details -template <typename T1, typename T2> -using concat_lists = typename details::concat_impl<T1, T2>::type; +template <typename T1, typename... Ts> +using concat_lists = typename details::concat_impl<decay<T1>, decay<Ts>...>::type; -template <typename T1, typename T2> -constexpr inline concat_lists<T1, T2> cconcat(T1, T2) +template <typename T1, typename... Ts> +constexpr inline concat_lists<T1, Ts...> cconcat(T1, Ts...) { return {}; } @@ -584,7 +605,7 @@ template <typename Fn> using function_result = typename details::function_arguments_impl<decltype(&Fn::operator())>::result; template <typename T1, typename T2> -using cfilter_t = typename details::filter_impl<T1, T2>::type; +using cfilter_t = typename details::filter_impl<decay<T1>, decay<T2>>::type; template <typename T, T... vals, bool... 
flags, typename Ret = cfilter_t<cvals_t<T, vals...>, cvals_t<bool, flags...>>> @@ -659,15 +680,13 @@ CMT_BIN_OP(^) namespace details { -template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep> -struct cvalseq_impl; - -template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep> -using cgen_seq = typename cvalseq_impl<T, Nsize, Nstart, Nstep>::type; template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep> -struct cvalseq_impl : concat_impl<cgen_seq<T, Nsize / 2, Nstart, Nstep>, - cgen_seq<T, Nsize - Nsize / 2, Nstart + (Nsize / 2) * Nstep, Nstep>> +struct cvalseq_impl + : concat_impl<typename cvalseq_impl<T, Nsize / 2, Nstart, Nstep>::type, + typename cvalseq_impl<T, Nsize - Nsize / 2, + static_cast<T>(Nstart + static_cast<ptrdiff_t>(Nsize / 2) * Nstep), + Nstep>::type> { }; @@ -679,6 +698,10 @@ template <typename T, T Nstart, ptrdiff_t Nstep> struct cvalseq_impl<T, 1, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart)> { }; +template <typename T, T Nstart, ptrdiff_t Nstep> +struct cvalseq_impl<T, 2, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart), static_cast<T>(Nstart + Nstep)> +{ +}; } // namespace details template <typename T, size_t size, T start = T(), ptrdiff_t step = 1> @@ -691,9 +714,11 @@ template <typename... List> using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>; template <size_t group, size_t... indices, size_t N = group * sizeof...(indices)> -constexpr inline auto scale(csizes_t<indices...> i) noexcept +constexpr inline auto scale(csizes_t<indices...> i) CMT_NOEXCEPT { - return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() % csize_t<group>(); + return cconcat(csizeseq_t<group, group * indices>()...); + // return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() % + // csize_t<group>(); } namespace details @@ -814,12 +839,14 @@ constexpr inline unsigned ilog2(T n, unsigned p = 0) return (n <= 1) ? 
p : ilog2(n / 2, p + 1); } +/// @brief Returns a nearest power of two that is greater or equal than n template <typename T> constexpr inline T next_poweroftwo(T n) { return n > 2 ? T(1) << (ilog2(n - 1) + 1) : n; } +/// @brief Returns a nearest power of two that is less or equal than n template <typename T> constexpr inline T prev_poweroftwo(T n) { @@ -1007,7 +1034,7 @@ template <> constexpr size_t elementsize<void>() { return 1; -}; +} } // namespace details /// @brief Utility typedef used to disable type deduction @@ -1018,7 +1045,7 @@ using identity = typename details::identity_impl<T>::type; struct swallow { template <typename... T> - CMT_INTRIN constexpr swallow(T&&...) noexcept + CMT_MEM_INTRINSIC constexpr swallow(T&&...) CMT_NOEXCEPT { } }; @@ -1029,52 +1056,52 @@ struct carray; template <typename T> struct carray<T, 1> { - CMT_INTRIN constexpr carray() noexcept = default; - CMT_INTRIN constexpr carray(T val) noexcept : val(val) {} + CMT_MEM_INTRINSIC constexpr carray() CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(T val) CMT_NOEXCEPT : val(val) {} template <typename Fn, size_t index = 0, CMT_ENABLE_IF(is_callable<Fn, csize_t<index>>::value)> - CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept + CMT_MEM_INTRINSIC constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) CMT_NOEXCEPT : val(static_cast<T>(fn(csize_t<index>()))) { } - CMT_INTRIN constexpr carray(const carray&) noexcept = default; - CMT_INTRIN constexpr carray(carray&&) noexcept = default; - CMT_INTRIN static constexpr size_t size() noexcept { return 1; } + CMT_MEM_INTRINSIC constexpr carray(const carray&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(carray&&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC static constexpr size_t size() CMT_NOEXCEPT { return 1; } template <size_t index> - CMT_INTRIN constexpr T& get(csize_t<index>) noexcept + CMT_MEM_INTRINSIC constexpr T& get(csize_t<index>) CMT_NOEXCEPT { static_assert(index == 0, 
"carray: Array index is out of range"); return val; } template <size_t index> - CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept + CMT_MEM_INTRINSIC constexpr const T& get(csize_t<index>) const CMT_NOEXCEPT { static_assert(index == 0, "carray: Array index is out of range"); return val; } template <size_t index> - CMT_INTRIN constexpr T& get() noexcept + CMT_MEM_INTRINSIC constexpr T& get() CMT_NOEXCEPT { return get(csize_t<index>()); } template <size_t index> - CMT_INTRIN constexpr const T& get() const noexcept + CMT_MEM_INTRINSIC constexpr const T& get() const CMT_NOEXCEPT { return get(csize_t<index>()); } - CMT_INTRIN constexpr const T* front() const noexcept { return val; } - CMT_INTRIN constexpr T* front() noexcept { return val; } - CMT_INTRIN constexpr const T* back() const noexcept { return val; } - CMT_INTRIN constexpr T* back() noexcept { return val; } - CMT_INTRIN constexpr const T* begin() const noexcept { return &val; } - CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; } - CMT_INTRIN constexpr T* begin() noexcept { return &val; } - CMT_INTRIN constexpr T* end() noexcept { return &val + 1; } - CMT_INTRIN constexpr const T* data() const noexcept { return begin(); } - CMT_INTRIN constexpr T* data() noexcept { return begin(); } - CMT_INTRIN constexpr bool empty() const noexcept { return false; } + CMT_MEM_INTRINSIC constexpr const T* front() const CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr T* front() CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr const T* back() const CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr T* back() CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr const T* begin() const CMT_NOEXCEPT { return &val; } + CMT_MEM_INTRINSIC constexpr const T* end() const CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC constexpr T* begin() CMT_NOEXCEPT { return &val; } + CMT_MEM_INTRINSIC constexpr T* end() CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC 
constexpr const T* data() const CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr T* data() CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr bool empty() const CMT_NOEXCEPT { return false; } T val; }; @@ -1082,55 +1109,56 @@ template <typename T, size_t N> struct carray : carray<T, N - 1> { template <typename... Ts> - CMT_INTRIN constexpr carray(T first, Ts... list) noexcept : carray<T, N - 1>(list...), val(first) + CMT_MEM_INTRINSIC constexpr carray(T first, Ts... list) CMT_NOEXCEPT : carray<T, N - 1>(list...), + val(first) { static_assert(sizeof...(list) + 1 == N, "carray: Argument count is invalid"); } template <typename Fn, size_t index = N - 1> - CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept + CMT_MEM_INTRINSIC constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) CMT_NOEXCEPT : carray<T, N - 1>(std::forward<Fn>(fn), csize_t<index - 1>()), val(static_cast<T>(fn(csize_t<index>()))) { } - CMT_INTRIN constexpr carray() noexcept = default; - CMT_INTRIN constexpr carray(const carray&) noexcept = default; - CMT_INTRIN constexpr carray(carray&&) noexcept = default; - CMT_INTRIN static constexpr size_t size() noexcept { return N; } - CMT_INTRIN constexpr T& get(csize_t<N - 1>) noexcept { return val; } + CMT_MEM_INTRINSIC constexpr carray() CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(const carray&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC constexpr carray(carray&&) CMT_NOEXCEPT = default; + CMT_MEM_INTRINSIC static constexpr size_t size() CMT_NOEXCEPT { return N; } + CMT_MEM_INTRINSIC constexpr T& get(csize_t<N - 1>) CMT_NOEXCEPT { return val; } template <size_t index> - CMT_INTRIN constexpr T& get(csize_t<index>) noexcept + CMT_MEM_INTRINSIC constexpr T& get(csize_t<index>) CMT_NOEXCEPT { return carray<T, N - 1>::get(csize_t<index>()); } - CMT_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; } + CMT_MEM_INTRINSIC constexpr const T& get(csize_t<N - 1>) 
const CMT_NOEXCEPT { return val; } template <size_t index> - CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept + CMT_MEM_INTRINSIC constexpr const T& get(csize_t<index>) const CMT_NOEXCEPT { return carray<T, N - 1>::get(csize_t<index>()); } template <size_t index> - CMT_INTRIN constexpr T& get() noexcept + CMT_MEM_INTRINSIC constexpr T& get() CMT_NOEXCEPT { return get(csize_t<index>()); } template <size_t index> - CMT_INTRIN constexpr const T& get() const noexcept + CMT_MEM_INTRINSIC constexpr const T& get() const CMT_NOEXCEPT { return get(csize_t<index>()); } - CMT_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); } - CMT_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); } - CMT_INTRIN constexpr const T* back() const noexcept { return val; } - CMT_INTRIN constexpr T* back() noexcept { return val; } - CMT_INTRIN constexpr const T* begin() const noexcept { return carray<T, N - 1>::begin(); } - CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; } - CMT_INTRIN constexpr T* begin() noexcept { return carray<T, N - 1>::begin(); } - CMT_INTRIN constexpr T* end() noexcept { return &val + 1; } - CMT_INTRIN constexpr const T* data() const noexcept { return begin(); } - CMT_INTRIN constexpr T* data() noexcept { return begin(); } - CMT_INTRIN constexpr bool empty() const noexcept { return false; } + CMT_MEM_INTRINSIC constexpr const T* front() const CMT_NOEXCEPT { return carray<T, N - 1>::front(); } + CMT_MEM_INTRINSIC constexpr T* front() CMT_NOEXCEPT { return carray<T, N - 1>::front(); } + CMT_MEM_INTRINSIC constexpr const T* back() const CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr T* back() CMT_NOEXCEPT { return val; } + CMT_MEM_INTRINSIC constexpr const T* begin() const CMT_NOEXCEPT { return carray<T, N - 1>::begin(); } + CMT_MEM_INTRINSIC constexpr const T* end() const CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC constexpr T* begin() CMT_NOEXCEPT { return 
carray<T, N - 1>::begin(); } + CMT_MEM_INTRINSIC constexpr T* end() CMT_NOEXCEPT { return &val + 1; } + CMT_MEM_INTRINSIC constexpr const T* data() const CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr T* data() CMT_NOEXCEPT { return begin(); } + CMT_MEM_INTRINSIC constexpr bool empty() const CMT_NOEXCEPT { return false; } private: T val; @@ -1162,45 +1190,52 @@ private: /// @brief Function that returns its first argument template <typename T> -CMT_INTRIN constexpr T&& pass_through(T&& x) noexcept +CMT_INTRINSIC constexpr T&& pass_through(T&& x) CMT_NOEXCEPT { return std::forward<T>(x); } /// @brief Function that returns void and ignores all its arguments template <typename... Ts> -CMT_INTRIN constexpr void noop(Ts&&...) noexcept +CMT_INTRINSIC constexpr void noop(Ts&&...) CMT_NOEXCEPT { } /// @brief Function that returns its first argument and ignores all other arguments template <typename T1, typename... Ts> -CMT_INTRIN constexpr T1&& get_first(T1&& x, Ts&&...) noexcept +CMT_INTRINSIC constexpr T1&& get_first(T1&& x, Ts&&...) CMT_NOEXCEPT { return std::forward<T1>(x); } /// @brief Function that returns its second argument and ignores all other arguments template <typename T1, typename T2, typename... Ts> -CMT_INTRIN constexpr T2&& get_second(T1, T2&& x, Ts&&...) noexcept +CMT_INTRINSIC constexpr T2&& get_second(T1, T2&& x, Ts&&...) CMT_NOEXCEPT { return std::forward<T2>(x); } /// @brief Function that returns its third argument and ignores all other arguments template <typename T1, typename T2, typename T3, typename... Ts> -CMT_INTRIN constexpr T3&& get_third(T1&&, T2&&, T3&& x, Ts&&...) noexcept +CMT_INTRINSIC constexpr T3&& get_third(T1&&, T2&&, T3&& x, Ts&&...) CMT_NOEXCEPT { return std::forward<T3>(x); } /// @brief Function that returns value-initialization of type T and ignores all its arguments template <typename T, typename... Ts> -CMT_INTRIN constexpr T returns(Ts&&...) +CMT_INTRINSIC constexpr T returns(Ts&&...) 
{ return T(); } +/// @brief Function that returns constant of type T and ignores all its arguments +template <typename T, T value, typename... Args> +CMT_INTRINSIC constexpr T return_constant(Args&&...) +{ + return value; +} + CMT_FN(pass_through) CMT_FN(noop) CMT_FN(get_first) @@ -1208,33 +1243,43 @@ CMT_FN(get_second) CMT_FN(get_third) CMT_FN_TPL((typename T), (T), returns) +template <typename T, T value> +struct fn_return_constant +{ + template <typename... Args> + constexpr T operator()(Args&&...) const noexcept + { + return value; + } +}; + template <typename T1, typename T2> -CMT_INTRIN bool is_equal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_equal(const T1& x, const T2& y) { return x == y; } template <typename T1, typename T2> -CMT_INTRIN bool is_notequal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_notequal(const T1& x, const T2& y) { return x != y; } template <typename T1, typename T2> -CMT_INTRIN bool is_less(const T1& x, const T2& y) +CMT_INTRINSIC bool is_less(const T1& x, const T2& y) { return x < y; } template <typename T1, typename T2> -CMT_INTRIN bool is_greater(const T1& x, const T2& y) +CMT_INTRINSIC bool is_greater(const T1& x, const T2& y) { return x > y; } template <typename T1, typename T2> -CMT_INTRIN bool is_lessorequal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_lessorequal(const T1& x, const T2& y) { return x <= y; } template <typename T1, typename T2> -CMT_INTRIN bool is_greaterorequal(const T1& x, const T2& y) +CMT_INTRINSIC bool is_greaterorequal(const T1& x, const T2& y) { return x >= y; } @@ -1313,7 +1358,7 @@ void cforeach_impl(Fn&& fn) #endif template <typename T, T... values, typename Fn> -CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn) +CMT_INTRINSIC void cforeach(cvals_t<T, values...>, Fn&& fn) { #ifdef CMT_COMPILER_CLANG swallow{ (fn(cval_t<T, values>()), void(), 0)... 
}; @@ -1323,7 +1368,7 @@ CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn) } template <typename T, typename Fn, CMT_ENABLE_IF(has_begin_end<T>::value)> -CMT_INTRIN void cforeach(T&& list, Fn&& fn) +CMT_INTRINSIC void cforeach(T&& list, Fn&& fn) { for (const auto& v : list) { @@ -1332,7 +1377,7 @@ CMT_INTRIN void cforeach(T&& list, Fn&& fn) } template <typename T, size_t N, typename Fn> -CMT_INTRIN void cforeach(const T (&array)[N], Fn&& fn) +CMT_INTRINSIC void cforeach(const T (&array)[N], Fn&& fn) { for (size_t i = 0; i < N; i++) { @@ -1344,59 +1389,94 @@ namespace details { template <size_t index, typename... types> -CMT_INTRIN auto get_type_arg(ctypes_t<types...>) +CMT_INTRINSIC auto get_type_arg(ctypes_t<types...>) { return ctype_t<type_of<details::get_nth_type<index, types...>>>(); } template <typename T0, typename... types, typename Fn, size_t... indices> -CMT_INTRIN void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, csizes_t<indices...>) +CMT_INTRINSIC void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, csizes_t<indices...>) { swallow{ (fn(get_type_arg<indices>(type_list)), void(), 0)... }; } +template <typename Fn> +CMT_INTRINSIC void cforeach_types_impl(ctypes_t<>, Fn&&, csizes_t<>) +{ +} } // namespace details template <typename... 
Ts, typename Fn> -CMT_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn) +CMT_INTRINSIC void cforeach(ctypes_t<Ts...> types, Fn&& fn) { details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq_t<sizeof...(Ts)>()); } template <typename A0, typename A1, typename Fn> -CMT_INTRIN void cforeach(A0&& a0, A1&& a1, Fn&& fn) +CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, Fn&& fn) { - cforeach(std::forward<A0>(a0), - [&](auto v0) { cforeach(std::forward<A1>(a1), [&](auto v1) { fn(v0, v1); }); }); + // Default capture causes ICE in Intel C++ + cforeach(std::forward<A0>(a0), // + [&a1, &fn](auto v0) { // + cforeach(std::forward<A1>(a1), // + [&v0, &fn](auto v1) { fn(v0, v1); }); + }); } template <typename A0, typename A1, typename A2, typename Fn> -CMT_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) -{ - cforeach(std::forward<A0>(a0), [&](auto v0) { - cforeach(std::forward<A1>(a1), - [&](auto v1) { cforeach(std::forward<A2>(a2), [&](auto v2) { fn(v0, v1, v2); }); }); - }); +CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) +{ + // Default capture causes ICE in Intel C++ + cforeach(std::forward<A0>(a0), // + [&a1, &a2, &fn](auto v0) { // + cforeach(std::forward<A1>(a1), // + [&v0, &a2, &fn](auto v1) { // + cforeach(std::forward<A2>(a2), // + [&v0, &v1, &fn](auto v2) { // + fn(v0, v1, v2); + }); + }); + }); +} + +template <typename A0, typename A1, typename A2, typename A3, typename Fn> +CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, A2&& a2, A3&& a3, Fn&& fn) +{ + // Default capture causes ICE in Intel C++ + cforeach(std::forward<A0>(a0), // + [&a1, &a2, &a3, &fn](auto v0) { // + cforeach(std::forward<A1>(a1), // + [&v0, &a2, &a3, &fn](auto v1) { // + cforeach(std::forward<A2>(a2), // + [&v0, &v1, &a3, &fn](auto v2) { // + cforeach(std::forward<A3>(a3), // + [&v0, &v1, &v2, &fn](auto v3) // + { fn(v0, v1, v2, v3); }); + }); + }); + }); } + template <typename TrueFn, typename FalseFn = fn_noop> -CMT_INTRIN decltype(auto) cif(cbool_t<true>, 
TrueFn&& truefn, FalseFn&& = FalseFn()) +CMT_INTRINSIC decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn()) { return truefn(ctrue); } template <typename TrueFn, typename FalseFn = fn_noop> -CMT_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) +CMT_INTRINSIC decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) { return falsefn(cfalse); } template <typename T, T start, T stop, typename BodyFn> -CMT_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) +CMT_INTRINSIC decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) { return cforeach(cvalseq_t<T, stop - start, start>(), std::forward<BodyFn>(bodyfn)); } template <typename T, T... vs, typename U, typename Function, typename Fallback = fn_noop> -void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& fallback = Fallback()) +CMT_INTRINSIC void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, + Fallback&& fallback = Fallback()) { bool result = false; swallow{ (result = result || ((vs == value) ? (function(cval_t<T, vs>()), void(), true) : false), void(), @@ -1406,14 +1486,15 @@ void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& } template <typename T, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal> -CMT_INTRIN decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), CmpFn&& = CmpFn()) +CMT_INTRINSIC decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), + CmpFn&& = CmpFn()) { return deffn(); } template <typename T, T v0, T... 
values, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal> -CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn, - DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn()) +CMT_INTRINSIC decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn, + DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn()) { if (cmpfn(value, v0)) { @@ -1428,7 +1509,6 @@ CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, namespace details { - template <typename T, typename Fn1, typename Fn2, typename... Fns> inline decltype(auto) cmatch_impl(T&& value, Fn1&& first, Fn2&& second, Fns&&... rest); template <typename T, typename Fn, typename... Ts> @@ -1491,15 +1571,15 @@ template <typename Fn> struct fn_noinline { template <typename... Args> - CMT_INTRIN result_of<Fn(Args...)> operator()(Args&&... args) const + CMT_MEM_INTRINSIC result_of<Fn(Args...)> operator()(Args&&... args) const { return noinline(Fn{}, std::forward<Args>(args)...); } -}; +}; // namespace cometa template <typename... Args, typename Fn, typename Ret = decltype(std::declval<Fn>()(std::declval<Args>()...)), typename NonMemFn = Ret (*)(Fn*, Args...)> -CMT_INTRIN NonMemFn make_nonmember(const Fn&) +CMT_INTRINSIC NonMemFn make_nonmember(const Fn&) { return [](Fn* fn, Args... args) -> Ret { return fn->operator()(std::forward<Args>(args)...); }; } @@ -1510,6 +1590,11 @@ constexpr inline T choose_const() static_assert(sizeof(T) != 0, "T not found in the list of template arguments"); return T(); } +template <typename T, typename C1> +constexpr inline T choose_const_fallback(C1 c1) +{ + return static_cast<T>(c1); +} /** * Selects constant of the specific type @@ -1518,10 +1603,21 @@ constexpr inline T choose_const() * CHECK( choose_const<f64>( 32.0f, 64.0 ) == 64.0 ); * @endcode */ +template <typename T, typename C1, typename... Cs, CMT_ENABLE_IF(std::is_same<T, C1>::value)> +constexpr inline T choose_const(C1 c1, Cs...) 
+{ + return static_cast<T>(c1); +} +template <typename T, typename C1, typename... Cs, CMT_ENABLE_IF(!std::is_same<T, C1>::value)> +constexpr inline T choose_const(C1, Cs... constants) +{ + return choose_const<T>(constants...); +} + template <typename T, typename C1, typename... Cs> -constexpr inline T choose_const(C1 c1, Cs... constants) +constexpr inline T choose_const_fallback(C1 c1, Cs... constants) { - return std::is_same<T, C1>::value ? static_cast<T>(c1) : choose_const<T>(constants...); + return std::is_same<T, C1>::value ? static_cast<T>(c1) : choose_const_fallback<T>(constants...); } template <typename Tfrom> @@ -1529,14 +1625,14 @@ struct autocast_impl { const Tfrom value; template <typename T> - CMT_INTRIN constexpr operator T() const noexcept + CMT_MEM_INTRINSIC constexpr operator T() const CMT_NOEXCEPT { return static_cast<T>(value); } }; template <typename Tfrom> -CMT_INTRIN constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) noexcept +CMT_INTRINSIC constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) CMT_NOEXCEPT { return { value }; } @@ -1603,49 +1699,49 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wundefined-reinterpret-cast") #endif template <typename T, typename U> -CMT_INLINE constexpr static T& ref_cast(U& ptr) +CMT_INTRINSIC constexpr static T& ref_cast(U& ptr) { return reinterpret_cast<T&>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static const T& ref_cast(const U& ptr) +CMT_INTRINSIC constexpr static const T& ref_cast(const U& ptr) { return reinterpret_cast<const T&>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static T* ptr_cast(U* ptr) +CMT_INTRINSIC constexpr static T* ptr_cast(U* ptr) { return reinterpret_cast<T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static const T* ptr_cast(const U* ptr) +CMT_INTRINSIC constexpr static const T* ptr_cast(const U* ptr) { return reinterpret_cast<const T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static T* 
ptr_cast(U* ptr, ptrdiff_t offset) +CMT_INTRINSIC constexpr static T* ptr_cast(U* ptr, ptrdiff_t offset) { return ptr_cast<T>(ptr_cast<unsigned char>(ptr) + offset); } template <typename T, typename U> -CMT_INLINE constexpr static T* derived_cast(U* ptr) +CMT_INTRINSIC constexpr static T* derived_cast(U* ptr) { return static_cast<T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static const T* derived_cast(const U* ptr) +CMT_INTRINSIC constexpr static const T* derived_cast(const U* ptr) { return static_cast<const T*>(ptr); } template <typename T, typename U> -CMT_INLINE constexpr static T implicit_cast(U&& value) +CMT_INTRINSIC constexpr static T implicit_cast(U&& value) { return std::forward<T>(value); } @@ -1751,6 +1847,228 @@ constexpr conditional<std::is_scalar<T>::value, T, const T&> const_min(const T& return x < y ? x : y; } +template <int n = 10> +struct overload_priority : overload_priority<n - 1> +{ +}; + +template <> +struct overload_priority<0> +{ +}; + +constexpr overload_priority<> overload_auto{}; + +using overload_generic = overload_priority<0>; + +#define CMT_GEN_LIST1(m, ...) m(0, __VA_ARGS__) +#define CMT_GEN_LIST2(m, ...) CMT_GEN_LIST1(m, __VA_ARGS__), m(1, __VA_ARGS__) +#define CMT_GEN_LIST3(m, ...) CMT_GEN_LIST2(m, __VA_ARGS__), m(2, __VA_ARGS__) +#define CMT_GEN_LIST4(m, ...) CMT_GEN_LIST3(m, __VA_ARGS__), m(3, __VA_ARGS__) +#define CMT_GEN_LIST5(m, ...) CMT_GEN_LIST4(m, __VA_ARGS__), m(4, __VA_ARGS__) +#define CMT_GEN_LIST6(m, ...) CMT_GEN_LIST5(m, __VA_ARGS__), m(5, __VA_ARGS__) +#define CMT_GEN_LIST7(m, ...) CMT_GEN_LIST6(m, __VA_ARGS__), m(6, __VA_ARGS__) +#define CMT_GEN_LIST8(m, ...) CMT_GEN_LIST7(m, __VA_ARGS__), m(7, __VA_ARGS__) +#define CMT_GEN_LIST9(m, ...) CMT_GEN_LIST8(m, __VA_ARGS__), m(8, __VA_ARGS__) +#define CMT_GEN_LIST10(m, ...) CMT_GEN_LIST9(m, __VA_ARGS__), m(9, __VA_ARGS__) + +#define CMT_GEN_LIST11(m, ...) CMT_GEN_LIST10(m, __VA_ARGS__), m(10, __VA_ARGS__) +#define CMT_GEN_LIST12(m, ...) 
CMT_GEN_LIST11(m, __VA_ARGS__), m(11, __VA_ARGS__) +#define CMT_GEN_LIST13(m, ...) CMT_GEN_LIST12(m, __VA_ARGS__), m(12, __VA_ARGS__) +#define CMT_GEN_LIST14(m, ...) CMT_GEN_LIST13(m, __VA_ARGS__), m(13, __VA_ARGS__) +#define CMT_GEN_LIST15(m, ...) CMT_GEN_LIST14(m, __VA_ARGS__), m(14, __VA_ARGS__) +#define CMT_GEN_LIST16(m, ...) CMT_GEN_LIST15(m, __VA_ARGS__), m(15, __VA_ARGS__) +#define CMT_GEN_LIST17(m, ...) CMT_GEN_LIST16(m, __VA_ARGS__), m(16, __VA_ARGS__) +#define CMT_GEN_LIST18(m, ...) CMT_GEN_LIST17(m, __VA_ARGS__), m(17, __VA_ARGS__) +#define CMT_GEN_LIST19(m, ...) CMT_GEN_LIST18(m, __VA_ARGS__), m(18, __VA_ARGS__) +#define CMT_GEN_LIST20(m, ...) CMT_GEN_LIST19(m, __VA_ARGS__), m(19, __VA_ARGS__) + +#define CMT_GEN_LIST21(m, ...) CMT_GEN_LIST20(m, __VA_ARGS__), m(20, __VA_ARGS__) +#define CMT_GEN_LIST22(m, ...) CMT_GEN_LIST21(m, __VA_ARGS__), m(21, __VA_ARGS__) +#define CMT_GEN_LIST23(m, ...) CMT_GEN_LIST22(m, __VA_ARGS__), m(22, __VA_ARGS__) +#define CMT_GEN_LIST24(m, ...) CMT_GEN_LIST23(m, __VA_ARGS__), m(23, __VA_ARGS__) +#define CMT_GEN_LIST25(m, ...) CMT_GEN_LIST24(m, __VA_ARGS__), m(24, __VA_ARGS__) +#define CMT_GEN_LIST26(m, ...) CMT_GEN_LIST25(m, __VA_ARGS__), m(25, __VA_ARGS__) +#define CMT_GEN_LIST27(m, ...) CMT_GEN_LIST26(m, __VA_ARGS__), m(26, __VA_ARGS__) +#define CMT_GEN_LIST28(m, ...) CMT_GEN_LIST27(m, __VA_ARGS__), m(27, __VA_ARGS__) +#define CMT_GEN_LIST29(m, ...) CMT_GEN_LIST28(m, __VA_ARGS__), m(28, __VA_ARGS__) +#define CMT_GEN_LIST30(m, ...) CMT_GEN_LIST29(m, __VA_ARGS__), m(29, __VA_ARGS__) + +#define CMT_GEN_LIST31(m, ...) CMT_GEN_LIST30(m, __VA_ARGS__), m(30, __VA_ARGS__) +#define CMT_GEN_LIST32(m, ...) CMT_GEN_LIST31(m, __VA_ARGS__), m(31, __VA_ARGS__) +#define CMT_GEN_LIST33(m, ...) CMT_GEN_LIST32(m, __VA_ARGS__), m(32, __VA_ARGS__) +#define CMT_GEN_LIST34(m, ...) CMT_GEN_LIST33(m, __VA_ARGS__), m(33, __VA_ARGS__) +#define CMT_GEN_LIST35(m, ...) CMT_GEN_LIST34(m, __VA_ARGS__), m(34, __VA_ARGS__) +#define CMT_GEN_LIST36(m, ...) 
CMT_GEN_LIST35(m, __VA_ARGS__), m(35, __VA_ARGS__) +#define CMT_GEN_LIST37(m, ...) CMT_GEN_LIST36(m, __VA_ARGS__), m(36, __VA_ARGS__) +#define CMT_GEN_LIST38(m, ...) CMT_GEN_LIST37(m, __VA_ARGS__), m(37, __VA_ARGS__) +#define CMT_GEN_LIST39(m, ...) CMT_GEN_LIST38(m, __VA_ARGS__), m(38, __VA_ARGS__) +#define CMT_GEN_LIST40(m, ...) CMT_GEN_LIST39(m, __VA_ARGS__), m(39, __VA_ARGS__) + +#define CMT_GEN_LIST41(m, ...) CMT_GEN_LIST40(m, __VA_ARGS__), m(40, __VA_ARGS__) +#define CMT_GEN_LIST42(m, ...) CMT_GEN_LIST41(m, __VA_ARGS__), m(41, __VA_ARGS__) +#define CMT_GEN_LIST43(m, ...) CMT_GEN_LIST42(m, __VA_ARGS__), m(42, __VA_ARGS__) +#define CMT_GEN_LIST44(m, ...) CMT_GEN_LIST43(m, __VA_ARGS__), m(43, __VA_ARGS__) +#define CMT_GEN_LIST45(m, ...) CMT_GEN_LIST44(m, __VA_ARGS__), m(44, __VA_ARGS__) +#define CMT_GEN_LIST46(m, ...) CMT_GEN_LIST45(m, __VA_ARGS__), m(45, __VA_ARGS__) +#define CMT_GEN_LIST47(m, ...) CMT_GEN_LIST46(m, __VA_ARGS__), m(46, __VA_ARGS__) +#define CMT_GEN_LIST48(m, ...) CMT_GEN_LIST47(m, __VA_ARGS__), m(47, __VA_ARGS__) +#define CMT_GEN_LIST49(m, ...) CMT_GEN_LIST48(m, __VA_ARGS__), m(48, __VA_ARGS__) +#define CMT_GEN_LIST50(m, ...) CMT_GEN_LIST49(m, __VA_ARGS__), m(49, __VA_ARGS__) + +#define CMT_GEN_LIST51(m, ...) CMT_GEN_LIST50(m, __VA_ARGS__), m(50, __VA_ARGS__) +#define CMT_GEN_LIST52(m, ...) CMT_GEN_LIST51(m, __VA_ARGS__), m(51, __VA_ARGS__) +#define CMT_GEN_LIST53(m, ...) CMT_GEN_LIST52(m, __VA_ARGS__), m(52, __VA_ARGS__) +#define CMT_GEN_LIST54(m, ...) CMT_GEN_LIST53(m, __VA_ARGS__), m(53, __VA_ARGS__) +#define CMT_GEN_LIST55(m, ...) CMT_GEN_LIST54(m, __VA_ARGS__), m(54, __VA_ARGS__) +#define CMT_GEN_LIST56(m, ...) CMT_GEN_LIST55(m, __VA_ARGS__), m(55, __VA_ARGS__) +#define CMT_GEN_LIST57(m, ...) CMT_GEN_LIST56(m, __VA_ARGS__), m(56, __VA_ARGS__) +#define CMT_GEN_LIST58(m, ...) CMT_GEN_LIST57(m, __VA_ARGS__), m(57, __VA_ARGS__) +#define CMT_GEN_LIST59(m, ...) CMT_GEN_LIST58(m, __VA_ARGS__), m(58, __VA_ARGS__) +#define CMT_GEN_LIST60(m, ...) 
CMT_GEN_LIST59(m, __VA_ARGS__), m(59, __VA_ARGS__) + +#define CMT_GEN_LIST61(m, ...) CMT_GEN_LIST60(m, __VA_ARGS__), m(60, __VA_ARGS__) +#define CMT_GEN_LIST62(m, ...) CMT_GEN_LIST61(m, __VA_ARGS__), m(61, __VA_ARGS__) +#define CMT_GEN_LIST63(m, ...) CMT_GEN_LIST62(m, __VA_ARGS__), m(62, __VA_ARGS__) +#define CMT_GEN_LIST64(m, ...) CMT_GEN_LIST63(m, __VA_ARGS__), m(63, __VA_ARGS__) +#define CMT_GEN_LIST65(m, ...) CMT_GEN_LIST64(m, __VA_ARGS__), m(64, __VA_ARGS__) +#define CMT_GEN_LIST66(m, ...) CMT_GEN_LIST65(m, __VA_ARGS__), m(65, __VA_ARGS__) +#define CMT_GEN_LIST67(m, ...) CMT_GEN_LIST66(m, __VA_ARGS__), m(66, __VA_ARGS__) +#define CMT_GEN_LIST68(m, ...) CMT_GEN_LIST67(m, __VA_ARGS__), m(67, __VA_ARGS__) +#define CMT_GEN_LIST69(m, ...) CMT_GEN_LIST68(m, __VA_ARGS__), m(68, __VA_ARGS__) +#define CMT_GEN_LIST70(m, ...) CMT_GEN_LIST69(m, __VA_ARGS__), m(69, __VA_ARGS__) + +#define CMT_GEN_LIST(c, m, ...) CMT_GEN_LIST##c(m, __VA_ARGS__) + +template <typename Tout, typename Tin> +constexpr CMT_INLINE Tout bitcast_anything(const Tin& in) +{ + static_assert(sizeof(Tin) == sizeof(Tout), "Invalid arguments for bitcast_anything"); +#ifdef CMT_COMPILER_INTEL + const union { + const Tin in; + Tout out; + } u{ in }; + return u.out; +#else + union { + Tin in; + Tout out; + } u{ in }; + return u.out; +#endif +} + +template <typename T> +constexpr T dont_deduce(T x) +{ + return x; +} + +template <typename Ty, typename T> +constexpr T just_value(T value) +{ + return value; +} + +enum class special_constant +{ + undefined, + default_constructed, + infinity, + neg_infinity, + min, + max, + neg_max, + lowest, + epsilon, + integer, + floating_point, + random_bits, +}; + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4700)) +CMT_PRAGMA_MSVC(warning(disable : 4146)) +struct special_value +{ + constexpr special_value(const special_value&) = default; + constexpr special_value(special_constant c) : c(c), ll(0), d(0) {} + constexpr special_value(double d) : 
c(special_constant::floating_point), ll(0), d(d) {} + constexpr special_value(long long ll) : c(special_constant::integer), ll(ll), d(0) {} + constexpr special_value(int i) : c(special_constant::integer), ll(i), d(0) {} + + template <typename T> + constexpr T get() const CMT_NOEXCEPT + { + switch (c) + { + CMT_PRAGMA_GNU(GCC diagnostic push) + CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wuninitialized") + CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") + case special_constant::undefined: + T undef; + return undef; + CMT_PRAGMA_GNU(GCC diagnostic pop) + case special_constant::default_constructed: + return T{}; + case special_constant::infinity: + return std::numeric_limits<subtype<T>>::infinity(); + case special_constant::neg_infinity: + { + subtype<T> gg = std::numeric_limits<subtype<T>>::infinity(); + return -gg; + } + case special_constant::min: + return std::numeric_limits<subtype<T>>::min(); + case special_constant::max: + return std::numeric_limits<subtype<T>>::max(); + case special_constant::neg_max: + return static_cast<T>(-std::numeric_limits<subtype<T>>::max()); + case special_constant::lowest: + return std::numeric_limits<subtype<T>>::lowest(); + case special_constant::integer: + return static_cast<T>(ll); + case special_constant::floating_point: + return static_cast<T>(d); + case special_constant::random_bits: + return random_bits<T>(); + default: + return T{}; + } + } + + template <typename T> + constexpr operator T() const CMT_NOEXCEPT + { + return get<T>(); + } + special_constant c; + long long ll; + double d; + + static std::mt19937& random_generator() + { + static std::mt19937 rnd(1); + return rnd; + } + + template <typename T> + static T random_bits() + { + union { + uint32_t bits[(sizeof(T) + sizeof(uint32_t) - 1) / sizeof(uint32_t)]; + T value; + } u; + for (uint32_t& b : u.bits) + { + b = random_generator()(); + } + return u.value; + } +}; +CMT_PRAGMA_MSVC(warning(pop)) + CMT_PRAGMA_GNU(GCC diagnostic pop) } // namespace cometa diff 
--git a/include/kfr/cometa/array.hpp b/include/kfr/cometa/array.hpp @@ -28,31 +28,32 @@ public: using size_type = std::size_t; using difference_type = std::ptrdiff_t; - constexpr array_ref() noexcept : m_data(nullptr), m_size(0) {} - constexpr array_ref(const array_ref&) noexcept = default; - constexpr array_ref(array_ref&&) noexcept = default; + constexpr array_ref() CMT_NOEXCEPT : m_data(nullptr), m_size(0) {} + constexpr array_ref(const array_ref&) CMT_NOEXCEPT = default; + constexpr array_ref(array_ref&&) CMT_NOEXCEPT = default; #ifdef CMT_COMPILER_GNU - constexpr array_ref& operator=(const array_ref&) noexcept = default; - constexpr array_ref& operator=(array_ref&&) noexcept = default; + constexpr array_ref& operator=(const array_ref&) CMT_NOEXCEPT = default; + constexpr array_ref& operator=(array_ref&&) CMT_NOEXCEPT = default; #else array_ref& operator=(const array_ref&) = default; array_ref& operator=(array_ref&&) = default; #endif template <size_t N> - constexpr array_ref(value_type (&arr)[N]) noexcept : m_data(arr), m_size(N) + constexpr array_ref(value_type (&arr)[N]) CMT_NOEXCEPT : m_data(arr), m_size(N) { } template <size_t N> - constexpr array_ref(const std::array<T, N>& arr) noexcept : m_data(arr.data()), m_size(N) + constexpr array_ref(const std::array<T, N>& arr) CMT_NOEXCEPT : m_data(arr.data()), m_size(N) { } template <size_t N> - constexpr array_ref(std::array<T, N>& arr) noexcept : m_data(arr.data()), m_size(N) + constexpr array_ref(std::array<T, N>& arr) CMT_NOEXCEPT : m_data(arr.data()), m_size(N) { } template <typename Alloc> - constexpr array_ref(const std::vector<T, Alloc>& vec) noexcept : m_data(vec.data()), m_size(vec.size()) + constexpr array_ref(const std::vector<T, Alloc>& vec) CMT_NOEXCEPT : m_data(vec.data()), + m_size(vec.size()) { } @@ -61,26 +62,26 @@ public: { } - constexpr array_ref(const std::initializer_list<T>& vec) noexcept - : m_data(vec.begin()), m_size(vec.size()) + constexpr array_ref(const std::initializer_list<T>& vec) 
CMT_NOEXCEPT : m_data(vec.begin()), + m_size(vec.size()) { } template <typename InputIter> - constexpr array_ref(InputIter first, InputIter last) noexcept - : m_data(std::addressof(*first)), m_size(std::distance(first, last)) + constexpr array_ref(InputIter first, InputIter last) CMT_NOEXCEPT : m_data(std::addressof(*first)), + m_size(std::distance(first, last)) { } - constexpr array_ref(T* data, size_type size) noexcept : m_data(data), m_size(size) {} - - constexpr reference front() const noexcept { return m_data[0]; } - constexpr reference back() const noexcept { return m_data[m_size - 1]; } - constexpr iterator begin() const noexcept { return m_data; } - constexpr iterator end() const noexcept { return m_data + m_size; } - constexpr const_iterator cbegin() const noexcept { return m_data; } - constexpr const_iterator cend() const noexcept { return m_data + m_size; } - constexpr pointer data() const noexcept { return m_data; } - constexpr std::size_t size() const noexcept { return m_size; } - constexpr bool empty() const noexcept { return !m_size; } + constexpr array_ref(T* data, size_type size) CMT_NOEXCEPT : m_data(data), m_size(size) {} + + constexpr reference front() const CMT_NOEXCEPT { return m_data[0]; } + constexpr reference back() const CMT_NOEXCEPT { return m_data[m_size - 1]; } + constexpr iterator begin() const CMT_NOEXCEPT { return m_data; } + constexpr iterator end() const CMT_NOEXCEPT { return m_data + m_size; } + constexpr const_iterator cbegin() const CMT_NOEXCEPT { return m_data; } + constexpr const_iterator cend() const CMT_NOEXCEPT { return m_data + m_size; } + constexpr pointer data() const CMT_NOEXCEPT { return m_data; } + constexpr std::size_t size() const CMT_NOEXCEPT { return m_size; } + constexpr bool empty() const CMT_NOEXCEPT { return !m_size; } constexpr reference operator[](std::size_t index) const { return m_data[index]; } private: @@ -126,22 +127,22 @@ inline array_ref<const T> make_array_ref(const std::vector<T>& cont) } template 
<typename C> -constexpr auto datatype(C& c) +constexpr auto elementtype(C& c) { return c[0]; } template <typename C> -constexpr auto datatype(const C& c) +constexpr auto elementtype(const C& c) { return c[0]; } template <typename E> -constexpr E datatype(const std::initializer_list<E>& il) +constexpr E elementtype(const std::initializer_list<E>&) { return {}; } template <typename T, std::size_t N> -constexpr T datatype(T (&array)[N]) +constexpr T elementtype(T (&)[N]) { return {}; } @@ -157,17 +158,17 @@ constexpr auto data(const C& c) -> decltype(c.data()) return c.data(); } template <typename T, std::size_t N> -constexpr T* data(T (&array)[N]) noexcept +constexpr T* data(T (&array)[N]) CMT_NOEXCEPT { return array; } template <typename T> -constexpr T* data(T* array) noexcept +constexpr T* data(T* array) CMT_NOEXCEPT { return array; } template <typename E> -constexpr const E* data(const std::initializer_list<E>& il) noexcept +constexpr const E* data(const std::initializer_list<E>& il) CMT_NOEXCEPT { return il.begin(); } @@ -178,7 +179,7 @@ constexpr auto size(const C& c) -> decltype(c.size()) return c.size(); } template <typename T, std::size_t N> -constexpr std::size_t size(const T (&array)[N]) noexcept +constexpr std::size_t size(const T (&)[N]) CMT_NOEXCEPT { return N; } diff --git a/include/kfr/cometa/cstring.hpp b/include/kfr/cometa/cstring.hpp @@ -24,48 +24,48 @@ struct cstring using value_type = char; using size_type = size_t; - constexpr const value_type* c_str() const noexcept { return value; } - constexpr const value_type* data() const noexcept { return value; } + constexpr const value_type* c_str() const CMT_NOEXCEPT { return value; } + constexpr const value_type* data() const CMT_NOEXCEPT { return value; } const value_type value[N]; - constexpr size_type length() const noexcept { return N - 1; } - constexpr size_type size() const noexcept { return N; } + constexpr size_type length() const CMT_NOEXCEPT { return N - 1; } + constexpr size_type size() 
const CMT_NOEXCEPT { return N; } template <size_t start, size_t count> - constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const noexcept + constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const CMT_NOEXCEPT { - return slice_impl(csizeseq_t<count, start>()); + return slice_impl(csizeseq<count, start>); } template <size_t start> - constexpr cstring<N - start> slice(csize_t<start>) const noexcept + constexpr cstring<N - start> slice(csize_t<start>) const CMT_NOEXCEPT { - return slice_impl(csizeseq_t<N - 1 - start, start>()); + return slice_impl(csizeseq<N - 1 - start, start>); } - constexpr friend bool operator==(const cstring& left, const cstring& right) noexcept + constexpr friend bool operator==(const cstring& left, const cstring& right) CMT_NOEXCEPT { for (size_t i = 0; i < 1; i++) if (left.value[i] != right.value[i]) return false; return true; } - constexpr friend bool operator!=(const cstring& left, const cstring& right) noexcept + constexpr friend bool operator!=(const cstring& left, const cstring& right) CMT_NOEXCEPT { return !(left == right); } template <size_t NN> - constexpr bool operator==(const cstring<NN>& other) const noexcept + constexpr bool operator==(const cstring<NN>&) const CMT_NOEXCEPT { return false; } template <size_t NN> - constexpr bool operator!=(const cstring<NN>& other) const noexcept + constexpr bool operator!=(const cstring<NN>&) const CMT_NOEXCEPT { return true; } - constexpr char operator[](size_t index) const noexcept { return value[index]; } + constexpr char operator[](size_t index) const CMT_NOEXCEPT { return value[index]; } private: template <size_t... indices> @@ -98,9 +98,9 @@ CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring< return concat_str_impl(str1, str2, cvalseq_t<size_t, N1 - 1 + N2 - 1>()); } template <size_t N1, size_t Nfrom, size_t Nto, size_t... 
indices> -CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str, - const cstring<Nfrom>&, const cstring<Nto>& to, - csizes_t<indices...>) +CMT_INTRINSIC cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str, + const cstring<Nfrom>&, const cstring<Nto>& to, + csizes_t<indices...>) { if (pos == size_t(-1)) stop_constexpr(); @@ -111,35 +111,35 @@ CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring< } } // namespace details -CMT_INTRIN constexpr cstring<1> concat_cstring() { return { { 0 } }; } +CMT_INTRINSIC constexpr cstring<1> concat_cstring() { return { { 0 } }; } template <size_t N1> -CMT_INTRIN constexpr cstring<N1> concat_cstring(const cstring<N1>& str1) +CMT_INTRINSIC constexpr cstring<N1> concat_cstring(const cstring<N1>& str1) { return str1; } template <size_t N1, size_t N2, typename... Args> -CMT_INTRIN constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2, - const Args&... args) +CMT_INTRINSIC constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2, + const Args&... args) { return details::concat_str_impl(str1, concat_cstring(str2, args...)); } template <size_t N> -CMT_INTRIN constexpr cstring<N> make_cstring(const char (&str)[N]) +CMT_INTRINSIC constexpr cstring<N> make_cstring(const char (&str)[N]) { return details::make_cstring_impl(str, cvalseq_t<size_t, N - 1>()); } template <char... 
chars> -CMT_INTRIN constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>) +CMT_INTRINSIC constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>) { return { { chars..., 0 } }; } template <size_t N1, size_t Nneedle> -CMT_INTRIN size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needle) +CMT_INTRINSIC size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needle) { size_t count = 0; for (size_t i = 0; i < N1; i++) @@ -155,8 +155,8 @@ CMT_INTRIN size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needl } template <size_t N1, size_t Nfrom, size_t Nto> -CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from, - const cstring<Nto>& to) +CMT_INTRINSIC cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from, + const cstring<Nto>& to) { return details::str_replace_impl(str_find(str, from), str, from, to, cvalseq_t<size_t, N1 - Nfrom + Nto - 1>()); diff --git a/include/kfr/cometa/ctti.hpp b/include/kfr/cometa/ctti.hpp @@ -12,7 +12,7 @@ using pconstvoid = const void*; struct type_id_t { - constexpr type_id_t(const void* id) noexcept : id(id) {} + constexpr type_id_t(const void* id) CMT_NOEXCEPT : id(id) {} constexpr bool operator==(type_id_t other) const { return id == other.id; } constexpr bool operator!=(type_id_t other) const { return !(id == other.id); } const void* const id; @@ -22,7 +22,7 @@ namespace details { template <typename T> -constexpr inline type_id_t typeident_impl() noexcept +constexpr inline type_id_t typeident_impl() CMT_NOEXCEPT { return type_id_t(pconstvoid(&typeident_impl<T>)); } @@ -30,21 +30,32 @@ constexpr inline type_id_t typeident_impl() noexcept #ifdef CMT_COMPILER_CLANG constexpr size_t typename_prefix = sizeof("auto cometa::ctype_name() [T = ") - 1; constexpr size_t typename_postfix = sizeof("]") - 1; +#elif CMT_COMPILER_MSVC +constexpr size_t typename_prefix = sizeof("auto __cdecl cometa::ctype_name<") 
- 1; +constexpr size_t typename_postfix = sizeof(">(void) noexcept") - 1; #else constexpr size_t typename_prefix = sizeof("constexpr auto cometa::ctype_name() [with T = ") - 1; constexpr size_t typename_postfix = sizeof("]") - 1; #endif template <size_t... indices, size_t Nout = 1 + sizeof...(indices)> -constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) noexcept +constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) CMT_NOEXCEPT { return cstring<Nout>{ { (str[indices])..., 0 } }; } } // namespace details +#ifdef CMT_COMPILER_MSVC +#define KFR_CALL_CONV_SPEC __cdecl +#else +#define KFR_CALL_CONV_SPEC +#endif + template <typename T> -constexpr auto ctype_name() noexcept +constexpr auto KFR_CALL_CONV_SPEC ctype_name() CMT_NOEXCEPT { + static_assert(details::typename_prefix + details::typename_postfix + 1 <= sizeof(CMT_FUNC_SIGNATURE) - 1, + "Incorrect details::typename_prefix or details::typename_postfix"); return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix, csizeseq_t<(sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix - details::typename_postfix)>()); @@ -57,7 +68,7 @@ constexpr auto ctype_name() noexcept * @return name of the type */ template <typename T> -inline const char* type_name() noexcept +inline const char* type_name() CMT_NOEXCEPT { static const auto name = ctype_name<T>(); return name.c_str(); @@ -70,7 +81,7 @@ inline const char* type_name() noexcept * @return name of the type */ template <typename T> -inline const char* type_name(T x) noexcept +inline const char* type_name(T x) CMT_NOEXCEPT { (void)x; return type_name<T>(); diff --git a/include/kfr/cometa/function.hpp b/include/kfr/cometa/function.hpp @@ -16,20 +16,20 @@ struct virtual_function { virtual Result operator()(Args... 
args) = 0; virtual virtual_function* make_copy() const = 0; - CMT_INTRIN virtual ~virtual_function() = default; + virtual ~virtual_function() = default; }; template <typename Fn, typename Result, typename... Args> struct virtual_function_impl : virtual_function<Result, Args...> { public: - CMT_INTRIN virtual_function_impl(const Fn& fn) : fn(fn) {} - CMT_INTRIN Result operator()(Args... args) override final { return fn(args...); } - CMT_INTRIN virtual_function<Result, Args...>* make_copy() const override final + CMT_MEM_INTRINSIC virtual_function_impl(const Fn& fn) : fn(fn) {} + CMT_MEM_INTRINSIC Result operator()(Args... args) final { return fn(args...); } + CMT_MEM_INTRINSIC virtual_function<Result, Args...>* make_copy() const final { return new virtual_function_impl{ fn }; } - CMT_INTRIN ~virtual_function_impl() {} + CMT_MEM_INTRINSIC ~virtual_function_impl() {} private: Fn fn; @@ -47,13 +47,13 @@ struct func_filter<Result(Args...)> }; template <typename T> -constexpr CMT_INTRIN T return_val() noexcept +constexpr CMT_INTRINSIC T return_val() CMT_NOEXCEPT { return {}; } template <> -constexpr CMT_INTRIN void return_val<void>() noexcept +constexpr CMT_INTRINSIC void return_val<void>() CMT_NOEXCEPT { } } // namespace details @@ -81,16 +81,16 @@ struct function<Result(Args...)> return *this; } - CMT_INTRIN function() : fn(nullptr) {} - CMT_INTRIN function(std::nullptr_t) : fn(nullptr) {} + CMT_MEM_INTRINSIC function() : fn(nullptr) {} + CMT_MEM_INTRINSIC function(std::nullptr_t) : fn(nullptr) {} template <typename Func> - CMT_INTRIN function(const Func& x) + CMT_MEM_INTRINSIC function(const Func& x) : fn(new details::virtual_function_impl<typename details::func_filter<Func>::type, Result, Args...>( x)) { } function(const this_t& other) : fn(other.fn ? 
other.fn->make_copy() : nullptr) {} - CMT_INTRIN function& operator=(const this_t& other) + CMT_MEM_INTRINSIC function& operator=(const this_t& other) { if ((&other != this) && (other.fn)) { @@ -100,14 +100,14 @@ struct function<Result(Args...)> } return *this; } - CMT_INTRIN function& operator=(std::nullptr_t) + CMT_MEM_INTRINSIC function& operator=(std::nullptr_t) { delete fn; fn = nullptr; return *this; } template <typename Fn> - CMT_INTRIN function& operator=(const Fn& x) + CMT_MEM_INTRINSIC function& operator=(const Fn& x) { using FnImpl = details::virtual_function_impl<typename details::func_filter<Fn>::type, Result, Args...>; @@ -116,15 +116,15 @@ struct function<Result(Args...)> fn = temp; return *this; } - CMT_INTRIN Result operator()(Args... args) const { return (*fn)(std::forward<Args>(args)...); } + CMT_MEM_INTRINSIC Result operator()(Args... args) const { return (*fn)(std::forward<Args>(args)...); } template <typename TResult> - CMT_INTRIN Result call(TResult&& default_result, Args... args) const + CMT_MEM_INTRINSIC Result call(TResult&& default_result, Args... args) const { return fn ? (*fn)(std::forward<Args>(args)...) 
: std::forward<TResult>(default_result); } - CMT_INTRIN explicit operator bool() const noexcept { return !!fn; } + CMT_MEM_INTRINSIC explicit operator bool() const CMT_NOEXCEPT { return !!fn; } - CMT_INTRIN ~function() { delete fn; } + CMT_MEM_INTRINSIC ~function() { delete fn; } private: details::virtual_function<Result, Args...>* fn; diff --git a/include/kfr/cometa/named_arg.hpp b/include/kfr/cometa/named_arg.hpp @@ -19,10 +19,10 @@ struct named_arg struct named { - constexpr named(const char* name) noexcept : name(name) {} + constexpr named(const char* name) CMT_NOEXCEPT : name(name) {} template <typename T> - CMT_INTRIN constexpr named_arg<T> operator=(T&& value) + CMT_MEM_INTRINSIC constexpr named_arg<T> operator=(T&& value) { return named_arg<T>{ std::forward<T>(value), name }; } diff --git a/include/kfr/cometa/numeric.hpp b/include/kfr/cometa/numeric.hpp @@ -0,0 +1,194 @@ +/** @addtogroup cometa + * @{ + */ +#pragma once + +#include "../cometa.hpp" + +namespace cometa +{ + +/// @brief Short names for common types +using b8 = bool; +using f32 = float; +using f64 = double; +using i8 = int8_t; +using i16 = int16_t; +using i32 = int32_t; +using i64 = int64_t; +using u8 = uint8_t; +using u16 = uint16_t; +using u32 = uint32_t; +using u64 = uint64_t; +using umax = uint64_t; +using imax = int64_t; +using fmax = double; +using f80 = long double; + +#if defined(CMT_BASETYPE_F32) || defined(CMT_NO_NATIVE_F64) +using fbase = float; +#else +using fbase = double; +#endif + +namespace details +{ +template <typename T> +struct fix_type_impl +{ + using type = T; +}; + +template <> +struct fix_type_impl<char> +{ + using type = i8; +}; + +template <> +struct fix_type_impl<unsigned long> +{ +#if ULONG_MAX == ULLONG_MAX + using type = u64; +#else + using type = u32; +#endif +}; + +template <> +struct fix_type_impl<signed long> +{ +#if LONG_MAX == LLONG_MAX + using type = i64; +#else + using type = i32; +#endif +}; + +template <> +struct fix_type_impl<unsigned long long> +{ + 
using type = u64; +}; + +template <> +struct fix_type_impl<signed long long> +{ + using type = i64; +}; + +} // namespace details + +template <typename T> +using fix_type = typename details::fix_type_impl<T>::type; + +/// @brief An enumeration representing data type +enum class datatype : int +{ + typebits_mask = 0xFF, + f = 0x100, // floating point + i = 0x200, // signed integer + u = 0x300, // unsigned integer + c = 0x400, // complex floating point + b = 0x500, // boolean + typeclass_mask = 0xF00, + f16 = static_cast<int>(f) | 16, + f32 = static_cast<int>(f) | 32, + f64 = static_cast<int>(f) | 64, + f80 = static_cast<int>(f) | 80, + i8 = static_cast<int>(i) | 8, + i16 = static_cast<int>(i) | 16, + i24 = static_cast<int>(i) | 24, + i32 = static_cast<int>(i) | 32, + i64 = static_cast<int>(i) | 64, + u8 = static_cast<int>(u) | 8, + u16 = static_cast<int>(u) | 16, + u24 = static_cast<int>(u) | 24, + u32 = static_cast<int>(u) | 32, + u64 = static_cast<int>(u) | 64, + c32 = static_cast<int>(c) | 32, + c64 = static_cast<int>(c) | 64, + b8 = static_cast<int>(b) | 8 +}; + +constexpr inline datatype operator|(datatype x, datatype y) +{ + using type = underlying_type<datatype>; + return static_cast<datatype>(static_cast<type>(x) | static_cast<type>(y)); +} + +constexpr inline datatype operator&(datatype x, datatype y) +{ + using type = underlying_type<datatype>; + return static_cast<datatype>(static_cast<type>(x) & static_cast<type>(y)); +} + +template <typename T> +constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value + ? datatype::f + : std::is_integral<typename compound_type_traits<T>::subtype>::value + ? (std::is_unsigned<typename compound_type_traits<T>::subtype>::value + ? 
datatype::u + : datatype::i) + : datatype(); + +template <typename T> +using is_f_class = std::integral_constant<bool, typeclass<T> == datatype::f>; +template <typename T> +using is_u_class = std::integral_constant<bool, typeclass<T> == datatype::u>; +template <typename T> +using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>; + +template <typename T> +struct typebits +{ + static_assert(is_number<deep_subtype<T>>::value, ""); + constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8; + constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width; + using subtype = typename compound_type_traits<T>::subtype; +}; + +template <typename T> +using ftype = + typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>; +template <typename T> +using itype = + typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>; +template <typename T> +using utype = + typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>; + +template <typename T> +using uitype = conditional<is_i_class<deep_subtype<T>>::value, T, utype<T>>; + +template <typename T> +using fsubtype = ftype<subtype<T>>; +template <typename T> +using isubtype = itype<subtype<T>>; +template <typename T> +using usubtype = utype<subtype<T>>; +namespace details +{ +template <typename T> +struct flt_type_impl +{ + using type = conditional<sizeof(T) <= 2, float, fbase>; +}; + +template <> +struct flt_type_impl<float> +{ + using type = float; +}; +template <> +struct flt_type_impl<double> +{ + using type = double; +}; +} // namespace details + +template <typename T> +using flt_type = typename cometa::compound_type_traits<T>::template deep_rebind< + typename details::flt_type_impl<deep_subtype<T>>::type>; + +} // namespace cometa diff --git a/include/kfr/cometa/range.hpp b/include/kfr/cometa/range.hpp @@ -19,8 
+19,9 @@ struct range using const_pointer = const T*; using diff_type = decltype(std::declval<T>() - std::declval<T>()); - constexpr range(value_type begin, value_type end, diff_type step) noexcept - : value_begin(begin), value_end(end), step(step) + constexpr range(value_type begin, value_type end, diff_type step) CMT_NOEXCEPT : min(begin), + max(end), + step(step) { } @@ -28,42 +29,44 @@ struct range { value_type value; diff_type step; - const_reference operator*() const { return value; } - const_pointer operator->() const { return &value; } - iterator& operator++() + constexpr const_reference operator*() const { return value; } + constexpr const_pointer operator->() const { return &value; } + constexpr iterator& operator++() { value += step; return *this; } - iterator operator++(int) + constexpr iterator operator++(int) { iterator copy = *this; ++(*this); return copy; } - bool operator!=(const iterator& other) const + constexpr bool operator!=(const iterator& other) const { return step > 0 ? value < other.value : value > other.value; } }; - value_type value_begin; - value_type value_end; + value_type min; + value_type max; diff_type step; - iterator begin() const { return iterator{ value_begin, step }; } - iterator end() const { return iterator{ value_end, step }; } + constexpr iterator begin() const { return iterator{ min, step }; } + constexpr iterator end() const { return iterator{ max, step }; } + + constexpr T distance() const { return max - min; } }; /// @brief Make iterable range object template <typename T> -range<T> make_range(T begin, T end) +constexpr range<T> make_range(T begin, T end) { return range<T>(begin, end, end > begin ? 
1 : -1); } /// @brief Make iterable range object with step -template <typename T, typename diff_type = decltype(std::declval<T>() - std::declval<T>())> -range<T> make_range(T begin, T end, diff_type step) +template <typename T, typename D> +constexpr range<std::common_type_t<T, D>> make_range(T begin, T end, D step) { - return range<T>(begin, end, step); + return range<std::common_type_t<T, D>>(begin, end, step); } } // namespace cometa diff --git a/include/kfr/cometa/result.hpp b/include/kfr/cometa/result.hpp @@ -20,18 +20,19 @@ struct result constexpr static error_type ok_value = OkValue; - constexpr result(const result&) = default; - constexpr result(result&&) noexcept = default; + constexpr result(const result&) = default; + constexpr result(result&&) CMT_NOEXCEPT = default; - constexpr result(ErrEnum error) noexcept : m_error(error) {} + constexpr result(ErrEnum error) CMT_NOEXCEPT : m_error(error) {} template <typename ValueInit, CMT_ENABLE_IF(std::is_constructible<value_type, ValueInit>::value)> - constexpr result(ValueInit&& value) noexcept : m_value(std::forward<ValueInit>(value)), m_error(OkValue) + constexpr result(ValueInit&& value) CMT_NOEXCEPT : m_value(std::forward<ValueInit>(value)), + m_error(OkValue) { } - constexpr result(const Type& value) noexcept : m_value(value), m_error(OkValue) {} - constexpr result(Type&& value) noexcept : m_value(std::move(value)), m_error(OkValue) {} + constexpr result(const Type& value) CMT_NOEXCEPT : m_value(value), m_error(OkValue) {} + constexpr result(Type&& value) CMT_NOEXCEPT : m_value(std::move(value)), m_error(OkValue) {} constexpr explicit operator bool() const { return m_error == OkValue; } constexpr const_reference operator*() const { return m_value; } diff --git a/include/kfr/cometa/string.hpp b/include/kfr/cometa/string.hpp @@ -27,7 +27,7 @@ template <typename T> struct representation { using type = T; - static constexpr const T& get(const T& value) noexcept { return value; } + static constexpr const T& 
get(const T& value) CMT_NOEXCEPT { return value; } }; template <typename T> @@ -175,7 +175,7 @@ CMT_INLINE auto pack_value(const fmt_t<T, t, width, prec>& value) } template <typename T> -CMT_INLINE auto pack_value(const T& value) +CMT_INLINE auto pack_value(const T&) { return pack_value(type_name<T>()); } @@ -218,7 +218,7 @@ CMT_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& template <size_t N1, size_t Nto> CMT_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt) { - return fmt_replace_impl(str, newfmt, csizeseq_t<N1 - 3 + Nto - 1>()); + return fmt_replace_impl(str, newfmt, csizeseq<N1 - 3 + Nto - 1>); } inline std::string replace_one(const std::string& str, const std::string& from, const std::string& to) @@ -305,7 +305,7 @@ struct print_t } }; -#ifdef CMT_COMPILER_GNU +#if defined CMT_COMPILER_GNU && !defined(CMT_COMPILER_INTEL) template <typename Char, Char... chars> constexpr format_t<chars...> operator""_format() diff --git a/include/kfr/cpuid.hpp b/include/kfr/cpuid.hpp @@ -1,26 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. 
- See https://www.kfrlib.com for details. - */ -#pragma once - -#include "cpuid/cpuid.hpp" -#include "cpuid/cpuid_auto.hpp" diff --git a/include/kfr/cpuid/cpuid.hpp b/include/kfr/cpuid/cpuid.hpp @@ -1,297 +0,0 @@ -/** @addtogroup cpuid - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#ifdef _MSC_VER -#include <intrin.h> -#endif - -#include "../base/platform.hpp" -#include "../base/types.hpp" -#include <cstring> - -namespace kfr -{ -#ifdef CMT_ARCH_X86 - -struct cpu_features -{ - u32 max; - u32 exmax; - u32 isIntel : 1; - u32 isAMD : 1; - u32 has3DNOW : 1; - u32 has3DNOWEXT : 1; - u32 hasABM : 1; - u32 hasADX : 1; - u32 hasAES : 1; - u32 hasAVX : 1; - u32 hasAVX2 : 1; - u32 hasAVXOSSUPPORT : 1; - u32 hasAVX512OSSUPPORT : 1; - u32 hasAVX512CD : 1; - u32 hasAVX512ER : 1; - u32 hasAVX512F : 1; - u32 hasAVX512DQ : 1; - u32 hasAVX512PF : 1; - u32 hasAVX512BW : 1; - u32 hasAVX512VL : 1; - u32 hasBMI1 : 1; - u32 hasBMI2 : 1; - u32 hasCLFSH : 1; - u32 hasCMOV : 1; - u32 hasCMPXCHG16B : 1; - u32 hasCX8 : 1; - u32 hasERMS : 1; - u32 hasF16C : 1; - u32 hasFMA : 1; - u32 hasFSGSBASE : 1; - u32 hasFXSR : 1; - u32 hasHLE : 1; - u32 hasINVPCID : 1; - u32 hasLAHF : 1; - u32 hasLZCNT : 1; - u32 hasMMX : 1; - u32 hasMMXEXT : 1; - u32 hasMONITOR : 1; - u32 hasMOVBE : 1; - u32 hasMSR : 1; - u32 hasOSXSAVE : 1; - u32 hasPCLMULQDQ : 1; - u32 hasPOPCNT : 1; - u32 hasPREFETCHWT1 : 1; - u32 hasRDRAND : 1; - u32 hasRDSEED : 1; - u32 hasRDTSCP : 1; - u32 hasRTM : 1; - u32 hasSEP : 1; - u32 hasSHA : 1; - u32 hasSSE : 1; - u32 hasSSE2 : 1; - u32 hasSSE3 : 1; - u32 hasSSE41 : 1; - u32 hasSSE42 : 1; - u32 hasSSE4a : 1; - u32 hasSSSE3 : 1; - u32 hasSYSCALL : 1; - u32 hasTBM : 1; - u32 hasXOP : 1; - u32 hasXSAVE : 1; - u32 padding1 : 6; - char vendor[17]; - char model[49]; - char padding2[2]; -}; - -namespace internal -{ - -struct cpu_data -{ - u32 data[4]; -}; - -#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG -CMT_INLINE u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx) -{ - __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc)); - return 1; -} -CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) -{ - get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]); -} 
-CMT_INLINE u32 get_xcr0() -{ - u32 xcr0; - __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); - return xcr0; -} -#elif defined CMT_COMPILER_MSVC - -CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) { __cpuidex((int*)ptr, (int)func, (int)subfunc); } -CMT_INLINE u32 get_xcr0() -{ -#ifdef _XCR_XFEATURE_ENABLED_MASK - unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); - return (u32)Result; -#else - return 0; -#endif -} -#endif - -template <size_t = 0> -cpu_t detect_cpu() -{ - cpu_features c; - memset(&c, 0, sizeof(c)); - cpu_data data0; - cpu_data exdata0; - - u32 f_1_ECX(0); - u32 f_1_EDX(0); - u32 f_7_EBX(0); - u32 f_7_ECX(0); - u32 f_81_ECX(0); - u32 f_81_EDX(0); - - cpuid(data0.data, 0); - c.max = static_cast<u32>(data0.data[0]); - cpuid(exdata0.data, 0x80000000); - c.exmax = static_cast<u32>(exdata0.data[0]); - - *ptr_cast<u32>(c.vendor) = static_cast<u32>(data0.data[1]); - *ptr_cast<u32>(c.vendor + 4) = static_cast<u32>(data0.data[3]); - *ptr_cast<u32>(c.vendor + 8) = static_cast<u32>(data0.data[2]); - - c.isIntel = strncmp(c.vendor, "GenuineIntel", sizeof(c.vendor)) == 0 ? 1 : 0; - c.isAMD = strncmp(c.vendor, "AuthenticAMD", sizeof(c.vendor)) == 0 ? 
1 : 0; - - if (c.max >= 1) - { - cpu_data data1; - cpuid(data1.data, 1); - f_1_ECX = static_cast<u32>(data1.data[2]); - f_1_EDX = static_cast<u32>(data1.data[3]); - } - - if (c.max >= 7) - { - cpu_data data7; - cpuid(data7.data, 7); - f_7_EBX = static_cast<u32>(data7.data[1]); - f_7_ECX = static_cast<u32>(data7.data[2]); - } - - if (c.exmax >= 0x80000001) - { - cpu_data data81; - cpuid(data81.data, 0x80000001); - f_81_ECX = static_cast<u32>(data81.data[2]); - f_81_EDX = static_cast<u32>(data81.data[3]); - } - - if (c.exmax >= 0x80000004) - { - cpu_data data82; - cpu_data data83; - cpu_data data84; - cpuid(data82.data, 0x80000002); - cpuid(data83.data, 0x80000003); - cpuid(data84.data, 0x80000004); - memcpy(c.model, data82.data, sizeof(cpu_data)); - memcpy(c.model + 16, data83.data, sizeof(cpu_data)); - memcpy(c.model + 32, data84.data, sizeof(cpu_data)); - } - - c.hasSSE3 = f_1_ECX >> 0 & 1; - c.hasPCLMULQDQ = f_1_ECX >> 1 & 1; - c.hasMONITOR = f_1_ECX >> 3 & 1; - c.hasSSSE3 = f_1_ECX >> 9 & 1; - c.hasFMA = f_1_ECX >> 12 & 1; - c.hasCMPXCHG16B = f_1_ECX >> 13 & 1; - c.hasSSE41 = f_1_ECX >> 19 & 1; - c.hasSSE42 = f_1_ECX >> 20 & 1; - c.hasMOVBE = f_1_ECX >> 22 & 1; - c.hasPOPCNT = f_1_ECX >> 23 & 1; - c.hasAES = f_1_ECX >> 25 & 1; - c.hasXSAVE = f_1_ECX >> 26 & 1; - c.hasOSXSAVE = f_1_ECX >> 27 & 1; - c.hasAVX = f_1_ECX >> 28 & 1; - c.hasF16C = f_1_ECX >> 29 & 1; - c.hasRDRAND = f_1_ECX >> 30 & 1; - c.hasMSR = f_1_EDX >> 5 & 1; - c.hasCX8 = f_1_EDX >> 8 & 1; - c.hasSEP = f_1_EDX >> 11 & 1; - c.hasCMOV = f_1_EDX >> 15 & 1; - c.hasCLFSH = f_1_EDX >> 19 & 1; - c.hasMMX = f_1_EDX >> 23 & 1; - c.hasFXSR = f_1_EDX >> 24 & 1; - c.hasSSE = f_1_EDX >> 25 & 1; - c.hasSSE2 = f_1_EDX >> 26 & 1; - c.hasFSGSBASE = f_7_EBX >> 0 & 1; - c.hasBMI1 = f_7_EBX >> 3 & 1; - c.hasHLE = c.isIntel && f_7_EBX >> 4 & 1; - c.hasAVX2 = f_7_EBX >> 5 & 1; - c.hasBMI2 = f_7_EBX >> 8 & 1; - c.hasERMS = f_7_EBX >> 9 & 1; - c.hasINVPCID = f_7_EBX >> 10 & 1; - c.hasRTM = c.isIntel && f_7_EBX >> 11 & 1; 
- c.hasAVX512F = f_7_EBX >> 16 & 1; - c.hasAVX512DQ = f_7_EBX >> 17 & 1; - c.hasRDSEED = f_7_EBX >> 18 & 1; - c.hasADX = f_7_EBX >> 19 & 1; - c.hasAVX512PF = f_7_EBX >> 26 & 1; - c.hasAVX512ER = f_7_EBX >> 27 & 1; - c.hasAVX512CD = f_7_EBX >> 28 & 1; - c.hasSHA = f_7_EBX >> 29 & 1; - c.hasAVX512BW = f_7_EBX >> 30 & 1; - c.hasAVX512VL = f_7_EBX >> 31 & 1; - c.hasPREFETCHWT1 = f_7_ECX >> 0 & 1; - c.hasLAHF = f_81_ECX >> 0 & 1; - c.hasLZCNT = c.isIntel && f_81_ECX >> 5 & 1; - c.hasABM = c.isAMD && f_81_ECX >> 5 & 1; - c.hasSSE4a = c.isAMD && f_81_ECX >> 6 & 1; - c.hasXOP = c.isAMD && f_81_ECX >> 11 & 1; - c.hasTBM = c.isAMD && f_81_ECX >> 21 & 1; - c.hasSYSCALL = c.isIntel && f_81_EDX >> 11 & 1; - c.hasMMXEXT = c.isAMD && f_81_EDX >> 22 & 1; - c.hasRDTSCP = c.isIntel && f_81_EDX >> 27 & 1; - c.has3DNOWEXT = c.isAMD && f_81_EDX >> 30 & 1; - c.has3DNOW = c.isAMD && f_81_EDX >> 31 & 1; - - c.hasAVXOSSUPPORT = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06; - c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0; - - if (c.hasAVX512F && c.hasAVX512CD && c.hasAVX512VL && c.hasAVX512BW && c.hasAVX512DQ && - c.hasAVX512OSSUPPORT) - return cpu_t::avx512; - if (c.hasAVX2 && c.hasAVXOSSUPPORT) - return cpu_t::avx2; - if (c.hasAVX && c.hasAVXOSSUPPORT) - return cpu_t::avx1; - if (c.hasSSE41) - return cpu_t::sse41; - if (c.hasSSSE3) - return cpu_t::ssse3; - if (c.hasSSE3) - return cpu_t::sse3; - if (c.hasSSE2) - return cpu_t::sse2; - return cpu_t::lowest; -} -} // namespace internal -#else - -template <size_t = 0> -cpu_t detect_cpu() -{ - return cpu_t::native; -} - -#endif -} // namespace kfr diff --git a/include/kfr/cpuid/cpuid_auto.hpp b/include/kfr/cpuid/cpuid_auto.hpp @@ -1,60 +0,0 @@ -/** @addtogroup cpuid - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public 
License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "cpuid.hpp" - -namespace kfr -{ -namespace internal -{ - -CMT_INLINE cpu_t& cpu_v() -{ - static cpu_t v1 = cpu_t::native; - return v1; -} - -CMT_INLINE char init_cpu_v() -{ - cpu_v() = detect_cpu<0>(); - return 0; -} - -CMT_INLINE char init_dummyvar() -{ - static char dummy = init_cpu_v(); - return dummy; -} - -static char dummyvar = init_dummyvar(); -} // namespace internal - -/** - * @brief Returns cpu instruction set detected at runtime. - */ -CMT_INLINE cpu_t get_cpu() { return internal::cpu_v(); } -} // namespace kfr diff --git a/include/kfr/data/sincos.hpp b/include/kfr/data/sincos.hpp @@ -1,192 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../base/kfr.h" -#include "../base/types.hpp" -#include <cstdint> - -namespace kfr -{ - -namespace data -{ - -template <typename T> -constexpr T c_sin_table[65] = { - /* sin(2*pi* 0/ 256) */ f32(0.0), - /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), - /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), - /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ 
f32(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 256) */ f32(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ 
f32(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ f32(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501), - /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000) -}; - -// data generated by mpfr -template <> -constexpr f64 c_sin_table<f64>[65] = { - /* sin(2*pi* 0/ 256) */ f64(0.0), - /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), - /* 
sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), - /* sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ 
f64(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ f64(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ 
f64(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ f64(0.9996988186962042201157656496661721968501), - /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000) -}; - -} // namespace data - -template <typename T> -constexpr inline T sin_using_table_256(size_t k) -{ - return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128]; -} - -template <typename T> -constexpr inline T sin_using_table(size_t size, size_t k) -{ - return sin_using_table_256<T>((k * 256 / size) % 256); -} -template <typename T> -constexpr inline T cos_using_table(size_t size, size_t k) -{ - return sin_using_table<T>(size, k + size / 4); -} -} // namespace kfr diff --git a/include/kfr/dft/cache.hpp b/include/kfr/dft/cache.hpp @@ -32,6 +32,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T> using dft_plan_ptr = std::shared_ptr<const dft_plan<T>>; @@ -166,4 +168,5 @@ univector<T> irealdft(const univector<complex<T>, Tag>& input) dft->execute(output, input, temp); return output; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dft +/** @addtogroup convolution * @{ */ /* @@ -25,12 +25,12 @@ */ #pragma once -#include "../base/complex.hpp" -#include "../base/constants.hpp" #include "../base/filter.hpp" #include "../base/memory.hpp" -#include "../base/read_write.hpp" -#include "../base/vec.hpp" +#include "../simd/complex.hpp" +#include "../simd/constants.hpp" +#include "../simd/read_write.hpp" +#include "../simd/vec.hpp" #include "cache.hpp" #include "fft.hpp" @@ -42,8 +42,10 @@ CMT_PRAGMA_GNU(GCC 
diagnostic ignored "-Wshadow") namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace internal +namespace intrinsics { template <typename T> univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2); @@ -51,27 +53,27 @@ template <typename T> univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2); template <typename T> univector<T> autocorrelate(const univector_ref<const T>& src1); -} // namespace internal +} // namespace intrinsics /// @brief Convolution template <typename T, univector_tag Tag1, univector_tag Tag2> univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) { - return internal::convolve(src1.slice(), src2.slice()); + return intrinsics::convolve(src1.slice(), src2.slice()); } /// @brief Correlation template <typename T, univector_tag Tag1, univector_tag Tag2> univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) { - return internal::correlate(src1.slice(), src2.slice()); + return intrinsics::correlate(src1.slice(), src2.slice()); } /// @brief Auto-correlation template <typename T, univector_tag Tag1> univector<T> autocorrelate(const univector<T, Tag1>& src) { - return internal::autocorrelate(src.slice()); + return intrinsics::autocorrelate(src.slice()); } /// @brief Convolution using Filter API @@ -91,12 +93,12 @@ protected: } void process_buffer(T* output, const T* input, size_t size) final; + const size_t size; + const size_t block_size; const dft_plan_real<T> fft; univector<u8> temp; std::vector<univector<complex<T>>> segments; std::vector<univector<complex<T>>> ir_segments; - const size_t size; - const size_t block_size; size_t input_position; univector<T> saved_input; univector<complex<T>> premul; @@ -105,6 +107,6 @@ protected: univector<T> overlap; size_t position; }; - +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/data/bitrev.hpp 
b/include/kfr/dft/data/bitrev.hpp diff --git a/include/kfr/dft/data/sincos.hpp b/include/kfr/dft/data/sincos.hpp @@ -0,0 +1,192 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../kfr.h" +#include "../../simd/types.hpp" +#include <cstdint> + +namespace kfr +{ + +namespace data +{ + +template <typename T> +constexpr T c_sin_table[65] = { + /* sin(2*pi* 0/ 256) */ f32(0.0), + /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), + /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ 
f32(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ 
f32(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501), + /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000) +}; + +// data generated by mpfr +template <> +constexpr f64 c_sin_table<f64>[65] = { + /* sin(2*pi* 0/ 256) */ f64(0.0), + /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), + /* 
sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ 
f64(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ 
f64(0.9996988186962042201157656496661721968501), + /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000) +}; + +} // namespace data + +template <typename T> +constexpr inline T sin_using_table_256(size_t k) +{ + return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128]; +} + +template <typename T> +constexpr inline T sin_using_table(size_t size, size_t k) +{ + return sin_using_table_256<T>((k * 256 / size) % 256); +} +template <typename T> +constexpr inline T cos_using_table(size_t size, size_t k) +{ + return sin_using_table<T>(size, k + size / 4); +} +} // namespace kfr diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -25,13 +25,13 @@ */ #pragma once -#include "../base/complex.hpp" -#include "../base/constants.hpp" #include "../base/memory.hpp" -#include "../base/read_write.hpp" #include "../base/small_buffer.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/complex.hpp" +#include "../simd/constants.hpp" +#include "../simd/read_write.hpp" +#include "../simd/vec.hpp" CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") @@ -57,9 +57,12 @@ enum class dft_type enum class dft_order { normal, - internal, // possibly bit/digit-reversed, implementation-defined, faster + internal, // possibly bit/digit-reversed, implementation-defined, faster to compute }; +inline namespace CMT_ARCH_NAME +{ + template <typename T> struct dft_stage; @@ -76,7 +79,8 @@ struct dft_plan void dump() const; - KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const + KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp, + bool inverse = false) const { if (inverse) execute_dft(ctrue, out, in, temp); @@ -85,14 +89,15 @@ struct dft_plan } ~dft_plan(); template <bool inverse> - KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, cbool_t<inverse> inv) const + KFR_MEM_INTRINSIC void 
execute(complex<T>* out, const complex<T>* in, u8* temp, + cbool_t<inverse> inv) const { execute_dft(inv, out, in, temp); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, bool inverse = false) const + KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, + univector<u8, Tag3>& temp, bool inverse = false) const { if (inverse) execute_dft(ctrue, out.data(), in.data(), temp.data()); @@ -100,8 +105,8 @@ struct dft_plan execute_dft(cfalse, out.data(), in.data(), temp.data()); } template <bool inverse, univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, cbool_t<inverse> inv) const + KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in, + univector<u8, Tag3>& temp, cbool_t<inverse> inv) const { execute_dft(inv, out.data(), in.data(), temp.data()); } @@ -128,6 +133,9 @@ protected: const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, const complex<T>* scratch, bool in_scratch) const; complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const; + + void init_dft(size_t size, dft_order order); + void init_fft(size_t size, dft_order order); }; enum class dft_pack_format @@ -155,14 +163,14 @@ struct dft_plan_real : dft_plan<T> void execute(univector<complex<T>, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&, cbool_t<inverse>) const = delete; - KFR_INTRIN void execute(complex<T>* out, const T* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, + dft_pack_format fmt = dft_pack_format::CCs) const { this->execute_dft(cfalse, out, 
ptr_cast<complex<T>>(in), temp); to_fmt(out, fmt); } - KFR_INTRIN void execute(T* out, const complex<T>* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, + dft_pack_format fmt = dft_pack_format::CCs) const { complex<T>* outdata = ptr_cast<complex<T>>(out); from_fmt(outdata, in, fmt); @@ -170,15 +178,17 @@ struct dft_plan_real : dft_plan<T> } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in, - univector<u8, Tag3>& temp, dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in, + univector<u8, Tag3>& temp, + dft_pack_format fmt = dft_pack_format::CCs) const { this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data()); to_fmt(out.data(), fmt); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> - KFR_INTRIN void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in, + univector<u8, Tag3>& temp, + dft_pack_format fmt = dft_pack_format::CCs) const { complex<T>* outdata = ptr_cast<complex<T>>(out.data()); from_fmt(outdata, in.data(), fmt); @@ -230,6 +240,7 @@ void fft_multiply_accumulate(univector<complex<T>, Tag1>& dest, const univector< if (fmt == dft_pack_format::Perm) dest[0] = f0; } +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp @@ -25,19 +25,21 @@ */ #pragma once -#include "../../base/complex.hpp" -#include "../../base/constants.hpp" -#include "../../base/digitreverse.hpp" -#include "../../base/vec.hpp" +#include 
"../../simd/complex.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/digitreverse.hpp" +#include "../../simd/vec.hpp" -#include "../../data/bitrev.hpp" +#include "../data/bitrev.hpp" #include "ft.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace internal +namespace intrinsics { constexpr bool fft_reorder_aligned = false; @@ -74,7 +76,7 @@ CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap(T* inout, size_t i) +KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i) { using cxx = cvec<T, 16>; constexpr size_t N = 1 << log2n; @@ -86,7 +88,7 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j) +KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j) { CMT_ASSUME(i != j); using cxx = cvec<T, 16>; @@ -103,7 +105,7 @@ KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j) +KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j) { CMT_ASSUME(i != j); using cxx = cvec<T, 16>; @@ -120,25 +122,25 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j) } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap(complex<T>* inout, size_t i) +KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i) { fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2); } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1) +KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1) { fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2); } template <size_t log2n, size_t bitrev, typename T> -KFR_INTRIN void 
fft_reorder_swap(complex<T>* inout, size_t i, size_t j) +KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j) { fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2); } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<11>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>) { fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4); fft_reorder_swap<11>(inout, 1 * 4, 64 * 4); @@ -207,7 +209,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<11>) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<7>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>) { constexpr size_t bitrev = 2; fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4); @@ -217,7 +219,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<7>) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<8>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>) { constexpr size_t bitrev = 4; fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4); @@ -231,7 +233,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<8>) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>) { constexpr size_t bitrev = 2; fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4); @@ -253,14 +255,14 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>) } template <typename T, bool use_br2> -void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) +KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) { cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, digitreverse<(use_br2 ? 
2 : 4), 2>(value)); } template <typename T, bool use_br2> -KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) +KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) { CMT_ASSUME(i != j); const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4); @@ -270,7 +272,7 @@ KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbo } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) { const size_t N = 1 << log2n; const size_t N4 = N / 4; @@ -305,7 +307,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) } template <typename T> -KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) { const size_t N = size_t(1) << log2n; const size_t N4 = N / 4; @@ -386,5 +388,6 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) i += istep; } } -} // namespace internal +} // namespace intrinsics +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp @@ -27,8 +27,10 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace internal +namespace intrinsics { template <typename T> @@ -76,18 +78,19 @@ univector<T> autocorrelate(const univector_ref<const T>& src1) return result; } -} // namespace internal +} // namespace intrinsics template <typename T> convolve_filter<T>::convolve_filter(size_t size, size_t block_size) - : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size), + : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size), segments((size + block_size - 1) / 
block_size) + { } template <typename T> convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size) - : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)), + : size(data.size()), block_size(next_poweroftwo(block_size)), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size), segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), @@ -124,8 +127,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) while (processed < size) { const size_t processing = std::min(size - processed, block_size - input_position); - internal::builtin_memcpy(saved_input.data() + input_position, input + processed, - processing * sizeof(T)); + builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T)); process(scratch, padded(saved_input)); fft.execute(segments[position], scratch, temp, dft_pack_format::Perm); @@ -152,7 +154,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) input_position = 0; process(saved_input, zeros()); - internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T)); + builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T)); position = position > 0 ? 
position - 1 : segments.size() - 1; } @@ -161,7 +163,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) } } -namespace internal +namespace intrinsics { template univector<float> convolve<float>(const univector_ref<const float>&, @@ -171,7 +173,7 @@ template univector<float> correlate<float>(const univector_ref<const float>&, template univector<float> autocorrelate<float>(const univector_ref<const float>&); -} // namespace internal +} // namespace intrinsics template convolve_filter<float>::convolve_filter(size_t, size_t); @@ -181,7 +183,7 @@ template void convolve_filter<float>::set_data(const univector<float>&); template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size); -namespace internal +namespace intrinsics { template univector<double> convolve<double>(const univector_ref<const double>&, @@ -191,7 +193,7 @@ template univector<double> correlate<double>(const univector_ref<const double>&, template univector<double> autocorrelate<double>(const univector_ref<const double>&); -} // namespace internal +} // namespace intrinsics template convolve_filter<double>::convolve_filter(size_t, size_t); @@ -200,5 +202,5 @@ template convolve_filter<double>::convolve_filter(const univector<double>&, size template void convolve_filter<double>::set_data(const univector<double>&); template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size); - +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp @@ -0,0 +1,123 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../dft_c.h" + +#include "../../base/basic_expressions.hpp" +#include "../../math/complex_math.hpp" +#include "../../testo/assert.hpp" +#include "../cache.hpp" +#include "../fft.hpp" +#include "bitrev.hpp" +#include "ft.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +#define DFT_ASSERT TESTO_ASSERT_INACTIVE + +template <typename T> +constexpr size_t fft_vector_width = vector_width<T>; + +using cdirect_t = cfalse_t; +using cinvert_t = ctrue_t; + +template <typename T> +struct dft_stage +{ + size_t radix = 0; + size_t stage_size = 0; + size_t data_size = 0; + size_t temp_size = 0; + u8* data = nullptr; + size_t repeats = 1; + size_t out_offset = 0; + size_t blocks = 0; + const char* name = nullptr; + bool recursion = false; + bool can_inplace = true; + bool inplace = false; + bool to_scratch = false; + bool need_reorder = true; + + void initialize(size_t size) { do_initialize(size); } + + virtual void dump() const + { + printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? 
name : "unnamed", radix, + stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, + to_scratch); + } + + KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cdirect_t(), out, in, temp); + } + KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cinvert_t(), out, in, temp); + } + virtual ~dft_stage() {} + +protected: + virtual void do_initialize(size_t) {} + virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; + virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; +}; + +#define DFT_STAGE_FN \ + void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ + { \ + return do_execute<false>(out, in, temp); \ + } \ + void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \ + { \ + return do_execute<true>(out, in, temp); \ + } + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wassume") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") +#endif + +template <typename T> +template <typename Stage, typename... Args> +void dft_plan<T>::add_stage(Args... args) +{ + dft_stage<T>* stage = new Stage(args...); + stage->need_reorder = need_reorder; + this->data_size += stage->data_size; + this->temp_size += stage->temp_size; + stages.push_back(dft_stage_ptr(stage)); +} + +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp @@ -23,20 +23,17 @@ disclosing the source code of your own applications. See https://www.kfrlib.com for details. 
*/ +#pragma once -#include "../dft_c.h" - -#include "../../base/basic_expressions.hpp" -#include "../../testo/assert.hpp" -#include "../cache.hpp" -#include "../fft.hpp" -#include "bitrev.hpp" -#include "ft.hpp" +#include "dft-fft.hpp" CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") #endif +#if CMT_HAS_WARNING("-Wunused-lambda-capture") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture") +#endif CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4100)) @@ -44,439 +41,15 @@ CMT_PRAGMA_MSVC(warning(disable : 4100)) namespace kfr { -constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; - -#define DFT_ASSERT TESTO_ASSERT_INACTIVE - -template <typename T> -constexpr size_t fft_vector_width = platform<T>::vector_width; - -using cdirect_t = cfalse_t; -using cinvert_t = ctrue_t; - -template <typename T> -struct dft_stage -{ - size_t radix = 0; - size_t stage_size = 0; - size_t data_size = 0; - size_t temp_size = 0; - u8* data = nullptr; - size_t repeats = 1; - size_t out_offset = 0; - size_t blocks = 0; - const char* name = nullptr; - bool recursion = false; - bool can_inplace = true; - bool inplace = false; - bool to_scratch = false; - bool need_reorder = true; - - void initialize(size_t size) { do_initialize(size); } - - virtual void dump() const - { - printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? 
name : "unnamed", radix, - stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, - to_scratch); - } - - KFR_INTRIN void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cdirect_t(), out, in, temp); - } - KFR_INTRIN void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cinvert_t(), out, in, temp); - } - virtual ~dft_stage() {} - -protected: - virtual void do_initialize(size_t) {} - virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; - virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; -}; - -#define DFT_STAGE_FN \ - void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ - { \ - return do_execute<false>(out, in, temp); \ - } \ - void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \ - { \ - return do_execute<true>(out, in, temp); \ - } - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wassume") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") -#endif - -namespace internal -{ - -template <size_t width, bool inverse, typename T> -KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>, - const cvec<T, width>& w, const cvec<T, width>& tw) -{ - cvec<T, width> ww = w; - cvec<T, width> tw_ = tw; - cvec<T, width> b1 = ww * dupeven(tw_); - ww = swap<2>(ww); - - if (inverse) - tw_ = -(tw_); - ww = subadd(b1, ww * dupodd(tw_)); - return ww; -} - -template <size_t width, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>, - cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in, - const complex<T>* twiddle) -{ - const size_t N4 = N / 4; - cvec<T, width> w1, w2, w3; - - cvec<T, width> sum02, sum13, diff02, diff13; - - cvec<T, width> a0, a1, a2, a3; - a0 = 
cread<width, aligned>(in + 0); - a2 = cread<width, aligned>(in + N4 * 2); - sum02 = a0 + a2; - - a1 = cread<width, aligned>(in + N4); - a3 = cread<width, aligned>(in + N4 * 3); - sum13 = a1 + a3; - - cwrite<width, aligned>(out, sum02 + sum13); - w2 = sum02 - sum13; - cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2), - radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2, - cread<width, true>(twiddle + width))); - diff02 = a0 - a2; - diff13 = a1 - a3; - if (inverse) - { - diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); - diff13 = swap<2>(diff13); - } - else - { - diff13 = swap<2>(diff13); - diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); - } - - w1 = diff02 + diff13; - - cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1), - radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1, - cread<width, true>(twiddle + 0))); - w3 = diff02 - diff13; - cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), - w3, cread<width, true>(twiddle + width * 2))); -} - -template <size_t width, bool inverse, typename T> -KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>, - const cvec<T, width>& w, const cvec<T, width>& tw) -{ - vec<T, width> re1, im1, twre, twim; - split(w, re1, im1); - split(tw, twre, twim); - - const vec<T, width> b1re = re1 * twre; - const vec<T, width> b1im = im1 * twre; - if (inverse) - return concat(b1re + im1 * twim, b1im - re1 * twim); - else - return concat(b1re - im1 * twim, b1im + re1 * twim); -} - -template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>, - cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, - const complex<T>* in, const complex<T>* twiddle) -{ - const size_t N4 = N / 4; - cvec<T, width> w1, w2, w3; - constexpr bool 
read_split = !splitin && splitout; - constexpr bool write_split = splitin && !splitout; - - vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3; - - split(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0); - split(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1); - split(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2); - split(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3); - - const vec<T, width> sum02re = re0 + re2; - const vec<T, width> sum02im = im0 + im2; - const vec<T, width> sum13re = re1 + re3; - const vec<T, width> sum13im = im1 + im3; - - cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im)); - w2 = concat(sum02re - sum13re, sum02im - sum13im); - cwrite_split<width, aligned, write_split>( - out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2, - cread<width, true>(twiddle + width))); - - const vec<T, width> diff02re = re0 - re2; - const vec<T, width> diff02im = im0 - im2; - const vec<T, width> diff13re = re1 - re3; - const vec<T, width> diff13im = im1 - im3; - - (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); - (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); - - cwrite_split<width, aligned, write_split>( - out + N4 * (use_br2 ? 
2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1, - cread<width, true>(twiddle + 0))); - cwrite_split<width, aligned, write_split>( - out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3, - cread<width, true>(twiddle + width * 2))); -} - -template <typename T> -CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) -{ - if (n == 0) - { - return make_vector(static_cast<T>(1), static_cast<T>(0)); - } - else if (n == size / 4) - { - return make_vector(static_cast<T>(0), static_cast<T>(-1)); - } - else if (n == size / 2) - { - return make_vector(static_cast<T>(-1), static_cast<T>(0)); - } - else if (n == size * 3 / 4) - { - return make_vector(static_cast<T>(0), static_cast<T>(1)); - } - else - { - fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size)); - fbase tcos = +kfr::cos(kth); - fbase tsin = -kfr::sin(kth); - return make_vector(static_cast<T>(tcos), static_cast<T>(tsin)); - } -} - -template <typename T, size_t width> -KFR_SINTRIN void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size, - bool split_format) -{ - vec<T, 2 * width> result = T(); - CMT_LOOP_UNROLL - for (size_t i = 0; i < width; i++) - { - const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size); - result[i * 2] = r[0]; - result[i * 2 + 1] = r[1]; - } - if (split_format) - ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result); - else - ref_cast<cvec<T, width>>(twiddle[0]) = result; - twiddle += width; -} - -template <typename T, size_t width> -CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) -{ - const size_t count = stage_size / 4; - size_t nnstep = size / stage_size; - DFT_ASSERT(width <= count); - CMT_LOOP_NOUNROLL - for (size_t n = 0; n < count; n += width) - { - initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format); - initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, 
nnstep * 2, size, split_format); - initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format); - } -} - -#if defined CMT_ARCH_SSE -#ifdef CMT_COMPILER_GNU -#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); -#else -#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0); -#endif -#else -#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr)); -#endif - -template <typename T> -KFR_SINTRIN void prefetch_one(const complex<T>* in) -{ - KFR_PREFETCH(in); -} - -template <typename T> -KFR_SINTRIN void prefetch_four(size_t stride, const complex<T>* in) -{ - KFR_PREFETCH(in); - KFR_PREFETCH(in + stride); - KFR_PREFETCH(in + stride * 2); - KFR_PREFETCH(in + stride * 3); -} - -template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, - bool inverse, bool aligned, typename T> -KFR_SINTRIN cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>, cbool_t<splitin>, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>* in, const complex<T>*& twiddle) -{ - constexpr static size_t prefetch_offset = width * 8; - const auto N4 = N / csize_t<4>(); - const auto N43 = N4 * csize_t<3>(); - CMT_ASSUME(blocks > 0); - CMT_ASSUME(N > 0); - CMT_ASSUME(N4 > 0); - DFT_ASSERT(width <= N4); - CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) - { - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (size_t n2 = 0; n2 < N4; n2 += width) - { - if (prefetch) - prefetch_four(N4, in + prefetch_offset); - radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(), - cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out, - in, twiddle + n2 * 3); - in += width; - out += width; - } - in += N43; - out += N43; - } - twiddle += N43; - return {}; -} - -template <bool splitin, size_t width, bool prefetch, bool 
use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +inline namespace CMT_ARCH_NAME { - CMT_ASSUME(blocks > 0); - constexpr static size_t prefetch_offset = 32 * 4; - for (size_t b = 0; b < blocks; b++) - { - if (prefetch) - prefetch_four(csize_t<64>(), out + prefetch_offset); - cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; - split(cread_split<8, aligned, splitin>(out + 0), w0, w1); - split(cread_split<8, aligned, splitin>(out + 8), w2, w3); - split(cread_split<8, aligned, splitin>(out + 16), w4, w5); - split(cread_split<8, aligned, splitin>(out + 24), w6, w7); - - butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); - - w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); - w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); - w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); - w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); - w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); - w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); - w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); - - cvec<T, 8> z0, z1, z2, z3; - transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); - - butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3); - cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3))); - out += 32; - } - return {}; -} - -template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) -{ - CMT_ASSUME(blocks > 0); - DFT_ASSERT(2 <= blocks); - constexpr static size_t prefetch_offset = width * 16; - 
for (size_t b = 0; b < blocks; b += 2) - { - if (prefetch) - prefetch_one(out + prefetch_offset); - - cvec<T, 8> vlo = cread<8, aligned>(out + 0); - cvec<T, 8> vhi = cread<8, aligned>(out + 8); - butterfly8<inverse>(vlo); - butterfly8<inverse>(vhi); - vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo); - vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi); - cwrite<8, aligned>(out, vlo); - cwrite<8, aligned>(out + 8, vhi); - out += 16; - } - return {}; -} - -template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) -{ - CMT_ASSUME(blocks > 0); - constexpr static size_t prefetch_offset = width * 4; - DFT_ASSERT(2 <= blocks); - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (size_t b = 0; b < blocks; b += 2) - { - if (prefetch) - prefetch_one(out + prefetch_offset); - - cvec<T, 16> vlo = cread<16, aligned>(out); - cvec<T, 16> vhi = cread<16, aligned>(out + 16); - butterfly4<4, inverse>(vlo); - butterfly4<4, inverse>(vhi); - apply_twiddles4<0, 4, 4, inverse>(vlo); - apply_twiddles4<0, 4, 4, inverse>(vhi); - vlo = digitreverse4<2>(vlo); - vhi = digitreverse4<2>(vhi); - butterfly4<4, inverse>(vlo); - butterfly4<4, inverse>(vhi); - - use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo); - use_br2 ? 
cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi); - out += 32; - } - return {}; -} +constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; -template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> -KFR_SINTRIN ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, - cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, - complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +namespace intrinsics { - constexpr static size_t prefetch_offset = width * 4; - CMT_ASSUME(blocks > 0); - DFT_ASSERT(4 <= blocks); - CMT_LOOP_NOUNROLL - for (size_t b = 0; b < blocks; b += 4) - { - if (prefetch) - prefetch_one(out + prefetch_offset); - - cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out); - butterfly4<4, inverse>(v16); - cdigitreverse4_write<aligned>(out, v16); - - out += 4 * 4; - } - return {}; -} template <typename T> -static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) +void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) { complex<T>* twiddle = ptr_cast<complex<T>>(stage->data); const size_t N = stage->repeats * stage->radix; @@ -507,7 +80,7 @@ static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) template <typename T, size_t fixed_radix> struct dft_stage_fixed_impl : dft_stage<T> { - dft_stage_fixed_impl(size_t radix_, size_t iterations, size_t blocks) + dft_stage_fixed_impl(size_t, size_t iterations, size_t blocks) { this->name = type_name<decltype(*this)>(); this->radix = fixed_radix; @@ -523,11 +96,11 @@ struct dft_stage_fixed_impl : dft_stage<T> constexpr static size_t width = fixed_radix >= 7 ? fft_vector_width<T> / 2 : fixed_radix >= 4 ? 
fft_vector_width<T> : fft_vector_width<T> * 2; - virtual void do_initialize(size_t size) override final { dft_stage_fixed_initialize(this, width); } + virtual void do_initialize(size_t) override final { dft_stage_fixed_initialize(this, width); } DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { const size_t Nord = this->repeats; const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); @@ -546,7 +119,7 @@ struct dft_stage_fixed_impl : dft_stage<T> template <typename T, size_t fixed_radix> struct dft_stage_fixed_final_impl : dft_stage<T> { - dft_stage_fixed_final_impl(size_t radix_, size_t iterations, size_t blocks) + dft_stage_fixed_final_impl(size_t, size_t iterations, size_t blocks) { this->name = type_name<decltype(*this)>(); this->radix = fixed_radix; @@ -561,10 +134,9 @@ struct dft_stage_fixed_final_impl : dft_stage<T> DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { - const size_t b = this->blocks; - const size_t size = b * fixed_radix; + const size_t b = this->blocks; butterflies(b, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, b); } @@ -584,27 +156,32 @@ inline auto apply_conj(E& e, ctrue_t) /// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1] template <typename E> -struct fft_inverse : expression_base<E> +struct fft_inverse : internal::expression_with_arguments<E> { using value_type = value_type_of<E>; - CMT_INLINE fft_inverse(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {} + KFR_MEM_INTRINSIC fft_inverse(E&& expr) CMT_NOEXCEPT + : internal::expression_with_arguments<E>(std::forward<E>(expr)) + { + } - CMT_INLINE vec<value_type, 1> operator()(cinput_t input, size_t index, vec_t<value_type, 1>) const + friend KFR_INTRINSIC vec<value_type, 1> 
get_elements(const fft_inverse& self, cinput_t input, + size_t index, vec_shape<value_type, 1>) { - return this->argument_first(input, index == 0 ? 0 : this->size() - index, vec_t<value_type, 1>()); + return self.argument_first(input, index == 0 ? 0 : self.size() - index, vec_shape<value_type, 1>()); } template <size_t N> - CMT_INLINE vec<value_type, N> operator()(cinput_t input, size_t index, vec_t<value_type, N>) const + friend KFR_MEM_INTRINSIC vec<value_type, N> get_elements(const fft_inverse& self, cinput_t input, + size_t index, vec_shape<value_type, N>) { if (index == 0) { return concat( - this->argument_first(input, index, vec_t<value_type, 1>()), - reverse(this->argument_first(input, this->size() - (N - 1), vec_t<value_type, N - 1>()))); + self.argument_first(input, index, vec_shape<value_type, 1>()), + reverse(self.argument_first(input, self.size() - (N - 1), vec_shape<value_type, N - 1>()))); } - return reverse(this->argument_first(input, this->size() - index - (N - 1), vec_t<value_type, N>())); + return reverse(self.argument_first(input, self.size() - index - (N - 1), vec_shape<value_type, N>())); } }; @@ -618,7 +195,7 @@ template <typename T> struct dft_arblen_stage_impl : dft_stage<T> { dft_arblen_stage_impl(size_t size) - : fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal), size(size) + : size(size), fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal) { this->name = type_name<decltype(*this)>(); this->radix = size; @@ -642,10 +219,9 @@ struct dft_arblen_stage_impl : dft_stage<T> DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) { - const size_t n = this->size; - const size_t N2 = this->fftsize; + const size_t n = this->size; auto&& chirp = apply_conj(chirp_, cbool<inverse>); @@ -703,7 +279,7 @@ struct dft_special_stage_impl : dft_stage<T> } DFT_STAGE_FN template 
<bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) { complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size); stage1.do_execute(cbool<inverse>, scratch, in, temp); @@ -730,7 +306,7 @@ struct dft_stage_generic_impl : dft_stage<T> } protected: - virtual void do_initialize(size_t size) override final + virtual void do_initialize(size_t) override final { complex<T>* twiddle = ptr_cast<complex<T>>(this->data); CMT_LOOP_NOUNROLL @@ -746,12 +322,10 @@ protected: DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) { const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); const size_t bl = this->blocks; - const size_t Nord = this->repeats; - const size_t N = Nord * this->radix; CMT_LOOP_NOUNROLL for (size_t b = 0; b < bl; b++) @@ -848,7 +422,7 @@ protected: DFT_STAGE_FN template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { cswitch(dft_radices, radices[0], [&](auto first_radix) { @@ -883,441 +457,7 @@ protected: }); } }; - -template <typename T, bool splitin, bool is_even> -struct fft_stage_impl : dft_stage<T> -{ - fft_stage_impl(size_t stage_size) - { - this->name = type_name<decltype(*this)>(); - this->radix = 4; - this->stage_size = stage_size; - this->repeats = 4; - this->recursion = true; - this->data_size = - align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment); - } - -protected: - constexpr static bool prefetch = true; - constexpr static bool aligned = false; - constexpr static size_t width = fft_vector_width<T>; - - virtual void do_initialize(size_t size) override final - { - complex<T>* twiddle = 
ptr_cast<complex<T>>(this->data); - initialize_twiddles<T, width>(twiddle, this->stage_size, size, true); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - if (splitin) - in = out; - const size_t stg_size = this->stage_size; - CMT_ASSUME(stg_size >= 2048); - CMT_ASSUME(stg_size % 2048 == 0); - radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); - } -}; - -template <typename T, bool splitin, size_t size> -struct fft_final_stage_impl : dft_stage<T> -{ - fft_final_stage_impl(size_t) - { - this->name = type_name<decltype(*this)>(); - this->radix = size; - this->stage_size = size; - this->out_offset = size; - this->repeats = 4; - this->recursion = true; - this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment); - } - -protected: - constexpr static size_t width = fft_vector_width<T>; - constexpr static bool is_even = cometa::is_even(ilog2(size)); - constexpr static bool use_br2 = !is_even; - constexpr static bool aligned = false; - constexpr static bool prefetch = splitin; - - KFR_INTRIN void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {} - KFR_INTRIN void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {} - - template <size_t N, bool pass_splitin> - KFR_INTRIN void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>, complex<T>*& twiddle) - { - constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; - constexpr size_t pass_width = const_min(width, N / 4); - initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_split || pass_splitin); - init_twiddles(csize<N / 4>, total_size, cbool<pass_split>, twiddle); - } - - virtual void do_initialize(size_t total_size) override final - { - complex<T>* twiddle = 
ptr_cast<complex<T>>(this->data); - init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle); - } - - template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> - KFR_INTRIN void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> - KFR_INTRIN void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse> - KFR_INTRIN void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse> - KFR_INTRIN void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, - const complex<T>*& twiddle) - { - radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse, size_t N, bool pass_splitin> - KFR_INTRIN void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out, - const 
complex<T>* in, const complex<T>*& twiddle) - { - static_assert(N > 8, ""); - constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; - constexpr size_t pass_width = const_min(width, N / 4); - static_assert(pass_width == width || (pass_split == pass_splitin), ""); - static_assert(pass_width <= N / 4, ""); - radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_split>, cbool_t<pass_splitin>(), - cbool_t<use_br2>(), cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, - twiddle); - final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_split>, out, out, twiddle); - } -}; - -template <typename T, bool is_even> -struct fft_reorder_stage_impl : dft_stage<T> -{ - fft_reorder_stage_impl(size_t stage_size) - { - this->name = type_name<decltype(*this)>(); - this->stage_size = stage_size; - log2n = ilog2(stage_size); - this->data_size = 0; - } - -protected: - size_t log2n; - - virtual void do_initialize(size_t) override final {} - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_reorder(out, log2n, cbool_t<!is_even>()); - } -}; - -template <typename T, size_t log2n> -struct fft_specialization; - -template <typename T> -struct fft_specialization<T, 1> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 1> a0, a1; - split(cread<2, aligned>(in), a0, a1); - cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1)); - } -}; - -template <typename T> -struct fft_specialization<T, 2> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 1> a0, a1, a2, 
a3; - split(cread<4>(in), a0, a1, a2, a3); - butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3); - cwrite<4>(out, concat(a0, a1, a2, a3)); - } -}; - -template <typename T> -struct fft_specialization<T, 3> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 8> v8 = cread<8, aligned>(in); - butterfly8<inverse>(v8); - cwrite<8, aligned>(out, v8); - } -}; - -template <typename T> -struct fft_specialization<T, 4> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 16> v16 = cread<16, aligned>(in); - butterfly16<inverse>(v16); - cwrite<16, aligned>(out, v16); - } -}; - -template <typename T> -struct fft_specialization<T, 5> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cvec<T, 32> v32 = cread<32, aligned>(in); - butterfly32<inverse>(v32); - cwrite<32, aligned>(out, v32); - } -}; - -template <typename T> -struct fft_specialization<T, 6> : dft_stage<T> -{ - fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } - -protected: - constexpr static bool aligned = false; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in); - } -}; - -template <typename T> -struct fft_specialization<T, 7> : dft_stage<T> -{ - fft_specialization(size_t) - { - this->name = 
type_name<decltype(*this)>(); - this->stage_size = 128; - this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); - } - -protected: - constexpr static bool aligned = false; - constexpr static size_t width = platform<T>::vector_width; - constexpr static bool use_br2 = true; - constexpr static bool prefetch = false; - constexpr static bool is_double = sizeof(T) == 8; - constexpr static size_t final_size = is_double ? 8 : 32; - constexpr static size_t split_format = final_size == 8; - - virtual void do_initialize(size_t total_size) override final - { - complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - initialize_twiddles<T, width>(twiddle, 128, total_size, split_format); - initialize_twiddles<T, width>(twiddle, 32, total_size, split_format); - initialize_twiddles<T, width>(twiddle, 8, total_size, split_format); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_pass<inverse>(csize_t<final_size>(), out, in, twiddle); - if (this->need_reorder) - fft_reorder(out, csize_t<7>()); - } - - template <bool inverse> - KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) - { - radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), - cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); - radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), - cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } - - template <bool inverse> - KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) - { - radix4_pass(128, 1, csize_t<width>(), cfalse, 
cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), - cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); - radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), - cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); - } -}; - -template <> -struct fft_specialization<float, 8> : dft_stage<float> -{ - fft_specialization(size_t) - { - this->name = type_name<decltype(*this)>(); - this->temp_size = sizeof(complex<float>) * 256; - } - -protected: - using T = float; - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) - { - complex<float>* scratch = ptr_cast<complex<float>>(temp); - if (out == in) - { - butterfly16_multi_flip<0, inverse>(scratch, out); - butterfly16_multi_flip<1, inverse>(scratch, out); - butterfly16_multi_flip<2, inverse>(scratch, out); - butterfly16_multi_flip<3, inverse>(scratch, out); - - butterfly16_multi_natural<0, inverse>(out, scratch); - butterfly16_multi_natural<1, inverse>(out, scratch); - butterfly16_multi_natural<2, inverse>(out, scratch); - butterfly16_multi_natural<3, inverse>(out, scratch); - } - else - { - butterfly16_multi_flip<0, inverse>(out, in); - butterfly16_multi_flip<1, inverse>(out, in); - butterfly16_multi_flip<2, inverse>(out, in); - butterfly16_multi_flip<3, inverse>(out, in); - - butterfly16_multi_natural<0, inverse>(out, out); - butterfly16_multi_natural<1, inverse>(out, out); - butterfly16_multi_natural<2, inverse>(out, out); - butterfly16_multi_natural<3, inverse>(out, out); - } - } -}; - -template <> -struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> -{ - using T = double; - fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) - { - this->name = type_name<decltype(*this)>(); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_final_stage_impl<double, 
false, 256>::template do_execute<inverse>(out, in, nullptr); - if (this->need_reorder) - fft_reorder(out, csize_t<8>()); - } -}; - -template <typename T> -struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> -{ - fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) - { - this->name = type_name<decltype(*this)>(); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr); - if (this->need_reorder) - fft_reorder(out, csize_t<9>()); - } -}; - -template <typename T> -struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> -{ - fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size) - { - this->name = type_name<decltype(*this)>(); - } - - DFT_STAGE_FN - template <bool inverse> - KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr); - if (this->need_reorder) - fft_reorder(out, 10, cfalse); - } -}; - -} // namespace internal - -// - -template <typename T> -template <typename Stage, typename... Args> -void dft_plan<T>::add_stage(Args... 
args) -{ - dft_stage<T>* stage = new Stage(args...); - stage->need_reorder = need_reorder; - this->data_size += stage->data_size; - this->temp_size += stage->temp_size; - stages.push_back(dft_stage_ptr(stage)); -} +} // namespace intrinsics template <typename T> template <bool is_final> @@ -1325,366 +465,83 @@ void dft_plan<T>::prepare_dft_stage(size_t radix, size_t iterations, size_t bloc { return cswitch( dft_radices, radix, - [&](auto radix) CMT_INLINE_LAMBDA { - add_stage<conditional<is_final, internal::dft_stage_fixed_final_impl<T, val_of(radix)>, - internal::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations, - blocks); + [this, iterations, blocks](auto radix) CMT_INLINE_LAMBDA { + add_stage<conditional<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>, + intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations, + blocks); }, - [&]() { add_stage<internal::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); }); + [this, radix, iterations, blocks]() { + add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); + }); } template <typename T> -template <bool is_even, bool first> -void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>) +void dft_plan<T>::init_dft(size_t size, dft_order) { - constexpr size_t final_size = is_even ? 
1024 : 512; - - if (stage_size >= 2048) + if (size == 60) { - add_stage<internal::fft_stage_impl<T, !first, is_even>>(stage_size); - - make_fft(stage_size / 4, cbool_t<is_even>(), cfalse); + this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(); } - else + else if (size == 48) { - add_stage<internal::fft_final_stage_impl<T, !first, final_size>>(final_size); + this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(); } -} - -template <typename T> -struct reverse_wrapper -{ - T& iterable; -}; - -template <typename T> -auto begin(reverse_wrapper<T> w) -{ - return std::rbegin(w.iterable); -} - -template <typename T> -auto end(reverse_wrapper<T> w) -{ - return std::rend(w.iterable); -} - -template <typename T> -reverse_wrapper<T> reversed(T&& iterable) -{ - return { iterable }; -} - -template <typename T> -void dft_plan<T>::initialize() -{ - data = autofree<u8>(data_size); - size_t offset = 0; - for (dft_stage_ptr& stage : stages) - { - stage->data = data.data() + offset; - stage->initialize(this->size); - offset += stage->data_size; - } - - bool to_scratch = false; - bool scratch_needed = false; - for (dft_stage_ptr& stage : reversed(stages)) - { - if (to_scratch) - { - scratch_needed = true; - } - stage->to_scratch = to_scratch; - if (!stage->can_inplace) - { - to_scratch = !to_scratch; - } - } - if (scratch_needed || !stages[0]->can_inplace) - this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); -} - -template <typename T> -const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch, bool in_scratch) const -{ - if (stage == 0) - return in_scratch ? scratch : in; - return stages[stage - 1]->to_scratch ? scratch : out; -} - -template <typename T> -complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const -{ - return stages[stage]->to_scratch ? 
scratch : out; -} - -template <typename T> -template <bool inverse> -void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const -{ - if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) - { - return stages[0]->execute(cbool<inverse>, out, in, temp); - } - size_t stack[32] = { 0 }; - - complex<T>* scratch = - ptr_cast<complex<T>>(temp + this->temp_size - - align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); - - bool in_scratch = !stages[0]->can_inplace && in == out; - if (in_scratch) + else { - internal::builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); - } - - const size_t count = stages.size(); + size_t cur_size = size; + constexpr size_t radices_count = dft_radices.back() + 1; + u8 count[radices_count] = { 0 }; + int radices[32] = { 0 }; + size_t radices_size = 0; - for (size_t depth = 0; depth < count;) - { - if (stages[depth]->recursion) - { - size_t offset = 0; - size_t rdepth = depth; - size_t maxdepth = depth; - do + cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) { + while (cur_size && cur_size % val_of(radix) == 0) { - if (stack[rdepth] == stages[rdepth]->repeats) - { - stack[rdepth] = 0; - rdepth--; - } - else - { - complex<T>* rout = select_out(rdepth, out, scratch); - const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); - stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); - offset += stages[rdepth]->out_offset; - stack[rdepth]++; - if (rdepth < count - 1 && stages[rdepth + 1]->recursion) - rdepth++; - else - maxdepth = rdepth; - } - } while (rdepth != depth); - depth = maxdepth + 1; - } - else - { - stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), - select_in(depth, out, in, scratch, in_scratch), temp); - depth++; - } - } -} + count[val_of(radix)]++; + cur_size /= val_of(radix); + } + }); -template <typename T> -dft_plan<T>::dft_plan(size_t 
size, dft_order order) : size(size), temp_size(0), data_size(0) -{ - need_reorder = true; - if (is_poweroftwo(size)) - { - const size_t log2n = ilog2(size); - cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n, - [&](auto log2n) { - (void)log2n; - constexpr size_t log2nv = val_of(decltype(log2n)()); - this->add_stage<internal::fft_specialization<T, log2nv>>(size); - }, - [&]() { - cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { - this->make_fft(size, is_even, ctrue); - constexpr size_t is_evenv = val_of(decltype(is_even)()); - if (need_reorder) - this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size); - }); - }); - } -#ifndef KFR_DFT_NO_NPo2 - else - { - if (size == 60) - { - this->add_stage<internal::dft_special_stage_impl<T, 6, 10>>(); - } - else if (size == 48) + if (cur_size >= 101) { - this->add_stage<internal::dft_special_stage_impl<T, 6, 8>>(); + this->add_stage<intrinsics::dft_arblen_stage_impl<T>>(size); } else { - size_t cur_size = size; - constexpr size_t radices_count = dft_radices.back() + 1; - u8 count[radices_count] = { 0 }; - int radices[32] = { 0 }; - size_t radices_size = 0; - - cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) { - while (cur_size && cur_size % val_of(radix) == 0) - { - count[val_of(radix)]++; - cur_size /= val_of(radix); - } - }); + size_t blocks = 1; + size_t iterations = size; - if (cur_size >= 101) + for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) { - this->add_stage<internal::dft_arblen_stage_impl<T>>(size); - } - else - { - size_t blocks = 1; - size_t iterations = size; - - for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) - { - for (size_t i = 0; i < count[r]; i++) - { - iterations /= r; - radices[radices_size++] = r; - if (iterations == 1) - this->prepare_dft_stage(r, iterations, blocks, ctrue); - else - this->prepare_dft_stage(r, iterations, blocks, cfalse); - blocks *= r; - } - } - - if (cur_size > 1) + for 
(size_t i = 0; i < count[r]; i++) { - iterations /= cur_size; - radices[radices_size++] = cur_size; + iterations /= r; + radices[radices_size++] = r; if (iterations == 1) - this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + this->prepare_dft_stage(r, iterations, blocks, ctrue); else - this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + this->prepare_dft_stage(r, iterations, blocks, cfalse); + blocks *= r; } - - if (stages.size() > 2) - this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size); } - } - } -#endif - initialize(); -} - -template <typename T> -dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) -{ - using namespace internal; - constexpr size_t width = platform<T>::vector_width * 2; - - block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { - constexpr size_t width = val_of(decltype(w)()); - cwrite<width>(rtwiddle.data() + i, - cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); - }); -} - -template <typename T> -void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const -{ - using namespace internal; - size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 - - constexpr size_t width = platform<T>::vector_width * 2; - const cvec<T, 1> dc = cread<1>(out); - const size_t count = csize / 2; - - block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { - i++; - constexpr size_t width = val_of(decltype(w)()); - constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); - const cvec<T, width> fpk = cread<width>(out + i); - const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1))); - - const cvec<T, width> f1k = fpk + fpnk; - const cvec<T, width> f2k = fpk - fpnk; - const cvec<T, width> t = cmul(f2k, tw); - cwrite<width>(out + i, T(0.5) * (f1k + t)); - cwrite<width>(out + 
csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t)))); - }); - - { - size_t k = csize / 2; - const cvec<T, 1> fpk = cread<1>(out + k); - const cvec<T, 1> fpnk = negodd(fpk); - cwrite<1>(out + k, fpnk); - } - if (fmt == dft_pack_format::CCs) - { - cwrite<1>(out, pack(dc[0] + dc[1], 0)); - cwrite<1>(out + csize, pack(dc[0] - dc[1], 0)); - } - else - { - cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1])); - } -} - -template <typename T> -void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const -{ - using namespace internal; - - const size_t csize = this->size / 2; - - cvec<T, 1> dc; - - if (fmt == dft_pack_format::CCs) - { - dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real()); - } - else - { - dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag()); - } - - constexpr size_t width = platform<T>::vector_width * 2; - const size_t count = csize / 2; - - block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { - i++; - constexpr size_t width = val_of(decltype(w)()); - constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); - const cvec<T, width> fpk = cread<width>(in + i); - const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); - - const cvec<T, width> f1k = fpk + fpnk; - const cvec<T, width> f2k = fpk - fpnk; - const cvec<T, width> t = cmul_conj(f2k, tw); - cwrite<width>(out + i, f1k + t); - cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); - }); + if (cur_size > 1) + { + iterations /= cur_size; + radices[radices_size++] = cur_size; + if (iterations == 1) + this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + else + this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + } - { - size_t k = csize / 2; - const cvec<T, 1> fpk = cread<1>(in + k); - const cvec<T, 1> fpnk = 2 * negodd(fpk); - cwrite<1>(out + k, fpnk); + if (stages.size() > 2) + 
this->add_stage<intrinsics::dft_reorder_stage_impl<T>>(radices, radices_size); + } } - cwrite<1>(out, dc); } -template <typename T> -dft_plan<T>::~dft_plan() -{ -} - -template <typename T> -void dft_plan<T>::dump() const -{ - for (const dft_stage_ptr& s : stages) - { - s->dump(); - } -} +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/dft-src.cpp b/include/kfr/dft/impl/dft-src.cpp @@ -24,7 +24,8 @@ See https://www.kfrlib.com for details. */ -#include "dft-impl.hpp" +#include "../dft_c.h" +#include "../fft.hpp" namespace kfr { @@ -41,27 +42,26 @@ extern "C" return reinterpret_cast<KFR_DFT_PLAN_F64*>(new kfr::dft_plan<double>(size)); } - void kfr_dft_execute_f32(KFR_DFT_PLAN_F32* plan, size_t size, float* out, const float* in, uint8_t* temp) + void kfr_dft_execute_f32(KFR_DFT_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<float>*>(plan)->execute( reinterpret_cast<kfr::complex<float>*>(out), reinterpret_cast<const kfr::complex<float>*>(in), temp, kfr::cfalse); } - void kfr_dft_execute_f64(KFR_DFT_PLAN_F64* plan, size_t size, double* out, const double* in, - uint8_t* temp) + void kfr_dft_execute_f64(KFR_DFT_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<double>*>(plan)->execute( reinterpret_cast<kfr::complex<double>*>(out), reinterpret_cast<const kfr::complex<double>*>(in), temp, kfr::cfalse); } - void kfr_dft_execute_inverse_f32(KFR_DFT_PLAN_F32* plan, size_t size, float* out, const float* in, + void kfr_dft_execute_inverse_f32(KFR_DFT_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<float>*>(plan)->execute( reinterpret_cast<kfr::complex<float>*>(out), reinterpret_cast<const kfr::complex<float>*>(in), temp, kfr::ctrue); } - void kfr_dft_execute_inverse_f64(KFR_DFT_PLAN_F64* plan, size_t size, double* out, const double* in, + void kfr_dft_execute_inverse_f64(KFR_DFT_PLAN_F64* 
plan, size_t, double* out, const double* in, uint8_t* temp) { reinterpret_cast<kfr::dft_plan<double>*>(plan)->execute( @@ -89,29 +89,29 @@ extern "C" return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(new kfr::dft_plan_real<double>(size)); } - void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, const float* in, + void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( reinterpret_cast<kfr::complex<float>*>(out), in, temp, static_cast<kfr::dft_pack_format>(pack_format)); } - void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, const double* in, + void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( reinterpret_cast<kfr::complex<double>*>(out), in, temp, static_cast<kfr::dft_pack_format>(pack_format)); } - void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, - const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, + uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( out, reinterpret_cast<const kfr::complex<float>*>(in), temp, static_cast<kfr::dft_pack_format>(pack_format)); } - void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, - const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, + uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( out, reinterpret_cast<const 
kfr::complex<double>*>(in), temp, diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp @@ -29,19 +29,13 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ -template dft_plan<FLOAT>::dft_plan(size_t, dft_order); -template dft_plan<FLOAT>::~dft_plan(); -template void dft_plan<FLOAT>::dump() const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template dft_plan_real<FLOAT>::dft_plan_real(size_t); -template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in, - kfr::dft_pack_format fmt) const; -template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const; - +#ifndef KFR_DFT_NO_NPo2 +template void dft_plan<FLOAT>::init_dft(size_t, dft_order); +#endif +} // namespace CMT_ARCH_NAME } // namespace kfr #endif diff --git a/include/kfr/dft/impl/fft-impl-f32.cpp b/include/kfr/dft/impl/fft-impl-f32.cpp @@ -0,0 +1,29 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#include "fft-impl.hpp" + +#define FLOAT float +#include "fft-templates.hpp" diff --git a/include/kfr/dft/impl/fft-impl-f64.cpp b/include/kfr/dft/impl/fft-impl-f64.cpp @@ -0,0 +1,29 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#include "fft-impl.hpp" + +#define FLOAT double +#include "fft-templates.hpp" diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp @@ -0,0 +1,1148 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "dft-fft.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +#endif +#if CMT_HAS_WARNING("-Wunused-lambda-capture") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture") +#endif + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4100)) + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <size_t width, bool inverse, typename T> +KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>, + const cvec<T, width>& w, const cvec<T, width>& tw) +{ + cvec<T, width> ww = w; + cvec<T, width> tw_ = tw; + cvec<T, width> b1 = ww * dupeven(tw_); + ww = swap<2>(ww); + + if (inverse) + tw_ = -(tw_); + ww = subadd(b1, ww * dupodd(tw_)); + return ww; +} + +template <size_t width, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>, + cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle) +{ + const size_t N4 = N / 4; + cvec<T, width> w1, w2, w3; + + cvec<T, width> sum02, sum13, diff02, diff13; + + cvec<T, width> a0, a1, a2, a3; + a0 = cread<width, aligned>(in + 0); + a2 = cread<width, aligned>(in + N4 * 
2); + sum02 = a0 + a2; + + a1 = cread<width, aligned>(in + N4); + a3 = cread<width, aligned>(in + N4 * 3); + sum13 = a1 + a3; + + cwrite<width, aligned>(out, sum02 + sum13); + w2 = sum02 - sum13; + cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2), + radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2, + cread<width, true>(twiddle + width))); + diff02 = a0 - a2; + diff13 = a1 - a3; + if (inverse) + { + diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); + diff13 = swap<2>(diff13); + } + else + { + diff13 = swap<2>(diff13); + diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); + } + + w1 = diff02 + diff13; + + cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1), + radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1, + cread<width, true>(twiddle + 0))); + w3 = diff02 - diff13; + cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), + w3, cread<width, true>(twiddle + width * 2))); +} + +template <size_t width, bool inverse, typename T> +KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>, + const cvec<T, width>& w, const cvec<T, width>& tw) +{ + vec<T, width> re1, im1, twre, twim; + split(w, re1, im1); + split(tw, twre, twim); + + const vec<T, width> b1re = re1 * twre; + const vec<T, width> b1im = im1 * twre; + if (inverse) + return concat(b1re + im1 * twim, b1im - re1 * twim); + else + return concat(b1re - im1 * twim, b1im + re1 * twim); +} + +template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>, + cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, + const complex<T>* in, const complex<T>* twiddle) +{ + const size_t N4 = N / 4; + cvec<T, width> w1, w2, w3; + constexpr bool read_split = !splitin && splitout; + constexpr bool write_split = 
splitin && !splitout; + + vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3; + + split(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0); + split(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1); + split(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2); + split(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3); + + const vec<T, width> sum02re = re0 + re2; + const vec<T, width> sum02im = im0 + im2; + const vec<T, width> sum13re = re1 + re3; + const vec<T, width> sum13im = im1 + im3; + + cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im)); + w2 = concat(sum02re - sum13re, sum02im - sum13im); + cwrite_split<width, aligned, write_split>( + out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2, + cread<width, true>(twiddle + width))); + + const vec<T, width> diff02re = re0 - re2; + const vec<T, width> diff02im = im0 - im2; + const vec<T, width> diff13re = re1 - re3; + const vec<T, width> diff13im = im1 - im3; + + (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); + (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); + + cwrite_split<width, aligned, write_split>( + out + N4 * (use_br2 ? 
2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1, + cread<width, true>(twiddle + 0))); + cwrite_split<width, aligned, write_split>( + out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3, + cread<width, true>(twiddle + width * 2))); +} + +template <typename T> +CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) +{ + if (n == 0) + { + return make_vector(static_cast<T>(1), static_cast<T>(0)); + } + else if (n == size / 4) + { + return make_vector(static_cast<T>(0), static_cast<T>(-1)); + } + else if (n == size / 2) + { + return make_vector(static_cast<T>(-1), static_cast<T>(0)); + } + else if (n == size * 3 / 4) + { + return make_vector(static_cast<T>(0), static_cast<T>(1)); + } + else + { + fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size)); + fbase tcos = +kfr::cos(kth); + fbase tsin = -kfr::sin(kth); + return make_vector(static_cast<T>(tcos), static_cast<T>(tsin)); + } +} + +template <typename T, size_t width> +KFR_INTRINSIC void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size, + bool split_format) +{ + vec<T, 2 * width> result = T(); + CMT_LOOP_UNROLL + for (size_t i = 0; i < width; i++) + { + const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size); + result[i * 2] = r[0]; + result[i * 2 + 1] = r[1]; + } + if (split_format) + ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result); + else + ref_cast<cvec<T, width>>(twiddle[0]) = result; + twiddle += width; +} + +template <typename T, size_t width> +CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) +{ + const size_t count = stage_size / 4; + size_t nnstep = size / stage_size; + DFT_ASSERT(width <= count); + CMT_LOOP_NOUNROLL + for (size_t n = 0; n < count; n += width) + { + initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format); + initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, 
nnstep * 2, size, split_format); + initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format); + } +} + +#ifdef KFR_NO_PREFETCH +#define KFR_PREFETCH(addr) \ + do \ + { \ + (void)(addr); \ + } while (0) +#else + +#if defined CMT_ARCH_SSE +#ifdef CMT_COMPILER_GNU +#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); +#else +#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0); +#endif +#else +#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr)); +#endif +#endif + +template <typename T> +KFR_INTRINSIC void prefetch_one(const complex<T>* in) +{ + KFR_PREFETCH(in); +} + +template <typename T> +KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in) +{ + KFR_PREFETCH(in); + KFR_PREFETCH(in + stride); + KFR_PREFETCH(in + stride * 2); + KFR_PREFETCH(in + stride * 3); +} + +template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, + bool inverse, bool aligned, typename T> +KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>, + cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, + cbool_t<aligned>, complex<T>* out, const complex<T>* in, + const complex<T>*& twiddle) +{ + constexpr static size_t prefetch_offset = width * 8; + const auto N4 = N / csize_t<4>(); + const auto N43 = N4 * csize_t<3>(); + CMT_ASSUME(blocks > 0); + CMT_ASSUME(N > 0); + CMT_ASSUME(N4 > 0); + DFT_ASSERT(width <= N4); + CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) + { + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (size_t n2 = 0; n2 < N4; n2 += width) + { + if (prefetch) + prefetch_four(N4, in + prefetch_offset); + radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(), + cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out, + in, twiddle + n2 * 3); + in += width; + out += width; + } + in += 
N43; + out += N43; + } + twiddle += N43; + return {}; +} + +template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + CMT_ASSUME(blocks > 0); + constexpr static size_t prefetch_offset = 32 * 4; + for (size_t b = 0; b < blocks; b++) + { + if (prefetch) + prefetch_four(csize_t<64>(), out + prefetch_offset); + cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; + split(cread_split<8, aligned, splitin>(out + 0), w0, w1); + split(cread_split<8, aligned, splitin>(out + 8), w2, w3); + split(cread_split<8, aligned, splitin>(out + 16), w4, w5); + split(cread_split<8, aligned, splitin>(out + 24), w6, w7); + + butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); + + w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); + w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); + w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); + w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); + w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); + w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); + w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); + + cvec<T, 8> z0, z1, z2, z3; + transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); + + butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3); + cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3))); + out += 32; + } + return {}; +} + +template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + CMT_ASSUME(blocks 
> 0); + DFT_ASSERT(2 <= blocks); + constexpr static size_t prefetch_offset = width * 16; + for (size_t b = 0; b < blocks; b += 2) + { + if (prefetch) + prefetch_one(out + prefetch_offset); + + cvec<T, 8> vlo = cread<8, aligned>(out + 0); + cvec<T, 8> vhi = cread<8, aligned>(out + 8); + butterfly8<inverse>(vlo); + butterfly8<inverse>(vhi); + vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo); + vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi); + cwrite<8, aligned>(out, vlo); + cwrite<8, aligned>(out + 8, vhi); + out += 16; + } + return {}; +} + +template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + CMT_ASSUME(blocks > 0); + constexpr static size_t prefetch_offset = width * 4; + DFT_ASSERT(2 <= blocks); + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (size_t b = 0; b < blocks; b += 2) + { + if (prefetch) + prefetch_one(out + prefetch_offset); + + cvec<T, 16> vlo = cread<16, aligned>(out); + cvec<T, 16> vhi = cread<16, aligned>(out + 16); + butterfly4<4, inverse>(vlo); + butterfly4<4, inverse>(vhi); + apply_twiddles4<0, 4, 4, inverse>(vlo); + apply_twiddles4<0, 4, 4, inverse>(vhi); + vlo = digitreverse4<2>(vlo); + vhi = digitreverse4<2>(vhi); + butterfly4<4, inverse>(vlo); + butterfly4<4, inverse>(vhi); + + use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo); + use_br2 ? 
cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi); + out += 32; + } + return {}; +} + +template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> +KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, + cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, + complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) +{ + constexpr static size_t prefetch_offset = width * 4; + CMT_ASSUME(blocks > 0); + DFT_ASSERT(4 <= blocks); + CMT_LOOP_NOUNROLL + for (size_t b = 0; b < blocks; b += 4) + { + if (prefetch) + prefetch_one(out + prefetch_offset); + + cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out); + butterfly4<4, inverse>(v16); + cdigitreverse4_write<aligned>(out, v16); + + out += 4 * 4; + } + return {}; +} + +template <typename T, bool splitin, bool is_even> +struct fft_stage_impl : dft_stage<T> +{ + fft_stage_impl(size_t stage_size) + { + this->name = type_name<decltype(*this)>(); + this->radix = 4; + this->stage_size = stage_size; + this->repeats = 4; + this->recursion = true; + this->data_size = + align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment); + } + +protected: + constexpr static bool prefetch = true; + constexpr static bool aligned = false; + constexpr static size_t width = fft_vector_width<T>; + + virtual void do_initialize(size_t size) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddles<T, width>(twiddle, this->stage_size, size, true); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + if (splitin) + in = out; + const size_t stg_size = this->stage_size; + CMT_ASSUME(stg_size >= 2048); + CMT_ASSUME(stg_size % 2048 == 0); + radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), 
cbool_t<!is_even>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + } +}; + +template <typename T, bool splitin, size_t size> +struct fft_final_stage_impl : dft_stage<T> +{ + fft_final_stage_impl(size_t) + { + this->name = type_name<decltype(*this)>(); + this->radix = size; + this->stage_size = size; + this->out_offset = size; + this->repeats = 4; + this->recursion = true; + this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment); + } + +protected: + constexpr static size_t width = fft_vector_width<T>; + constexpr static bool is_even = cometa::is_even(ilog2(size)); + constexpr static bool use_br2 = !is_even; + constexpr static bool aligned = false; + constexpr static bool prefetch = splitin; + + KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {} + KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {} + + template <size_t N, bool pass_splitin> + KFR_MEM_INTRINSIC void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>, + complex<T>*& twiddle) + { + constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; + constexpr size_t pass_width = const_min(width, N / 4); + initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_split || pass_splitin); + init_twiddles(csize<N / 4>, total_size, cbool<pass_split>, twiddle); + } + + virtual void do_initialize(size_t total_size) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle); + } + + template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> + KFR_MEM_INTRINSIC void 
final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)> + KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*, + const complex<T>*& twiddle) + { + radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse, size_t N, bool pass_splitin> + KFR_MEM_INTRINSIC void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out, + const complex<T>* in, const complex<T>*& twiddle) + { + static_assert(N > 8, ""); + constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width; + constexpr size_t pass_width = const_min(width, N / 4); + static_assert(pass_width == width || (pass_split == pass_splitin), ""); + static_assert(pass_width <= N / 4, ""); + radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_split>, cbool_t<pass_splitin>(), + cbool_t<use_br2>(), cbool_t<prefetch>(), 
cbool_t<inverse>(), cbool_t<aligned>(), out, in, + twiddle); + final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_split>, out, out, twiddle); + } +}; + +template <typename T, bool is_even> +struct fft_reorder_stage_impl : dft_stage<T> +{ + fft_reorder_stage_impl(size_t stage_size) + { + this->name = type_name<decltype(*this)>(); + this->stage_size = stage_size; + log2n = ilog2(stage_size); + this->data_size = 0; + } + +protected: + size_t log2n; + + virtual void do_initialize(size_t) override final {} + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*) + { + fft_reorder(out, log2n, cbool_t<!is_even>()); + } +}; + +template <typename T, size_t log2n> +struct fft_specialization; + +template <typename T> +struct fft_specialization<T, 1> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 1> a0, a1; + split(cread<2, aligned>(in), a0, a1); + cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1)); + } +}; + +template <typename T> +struct fft_specialization<T, 2> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 1> a0, a1, a2, a3; + split(cread<4>(in), a0, a1, a2, a3); + butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3); + cwrite<4>(out, concat(a0, a1, a2, a3)); + } +}; + +template <typename T> +struct fft_specialization<T, 3> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC 
void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 8> v8 = cread<8, aligned>(in); + butterfly8<inverse>(v8); + cwrite<8, aligned>(out, v8); + } +}; + +template <typename T> +struct fft_specialization<T, 4> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 16> v16 = cread<16, aligned>(in); + butterfly16<inverse>(v16); + cwrite<16, aligned>(out, v16); + } +}; + +template <typename T> +struct fft_specialization<T, 5> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cvec<T, 32> v32 = cread<32, aligned>(in); + butterfly32<inverse>(v32); + cwrite<32, aligned>(out, v32); + } +}; + +template <typename T> +struct fft_specialization<T, 6> : dft_stage<T> +{ + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } + +protected: + constexpr static bool aligned = false; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in); + } +}; + +template <typename T> +struct fft_specialization<T, 7> : dft_stage<T> +{ + fft_specialization(size_t) + { + this->name = type_name<decltype(*this)>(); + this->stage_size = 128; + this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); + } + +protected: + constexpr static bool aligned = false; + constexpr static size_t width = vector_width<T>; + constexpr static bool use_br2 = true; + constexpr static bool prefetch = false; + constexpr static bool is_double = sizeof(T) == 8; + 
constexpr static size_t final_size = is_double ? 8 : 32; + constexpr static size_t split_format = final_size == 8; + + virtual void do_initialize(size_t total_size) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddles<T, width>(twiddle, 128, total_size, split_format); + initialize_twiddles<T, width>(twiddle, 32, total_size, split_format); + initialize_twiddles<T, width>(twiddle, 8, total_size, split_format); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + final_pass<inverse>(csize_t<final_size>(), out, in, twiddle); + if (this->need_reorder) + fft_reorder(out, csize_t<7>()); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle) + { + radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } + + template <bool inverse> + KFR_MEM_INTRINSIC void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle) + { + radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + } +}; + +template <> +struct fft_specialization<float, 8> : dft_stage<float> +{ + 
fft_specialization(size_t) + { + this->name = type_name<decltype(*this)>(); + this->temp_size = sizeof(complex<float>) * 256; + } + +protected: + using T = float; + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + complex<float>* scratch = ptr_cast<complex<float>>(temp); + if (out == in) + { + butterfly16_multi_flip<0, inverse>(scratch, out); + butterfly16_multi_flip<1, inverse>(scratch, out); + butterfly16_multi_flip<2, inverse>(scratch, out); + butterfly16_multi_flip<3, inverse>(scratch, out); + + butterfly16_multi_natural<0, inverse>(out, scratch); + butterfly16_multi_natural<1, inverse>(out, scratch); + butterfly16_multi_natural<2, inverse>(out, scratch); + butterfly16_multi_natural<3, inverse>(out, scratch); + } + else + { + butterfly16_multi_flip<0, inverse>(out, in); + butterfly16_multi_flip<1, inverse>(out, in); + butterfly16_multi_flip<2, inverse>(out, in); + butterfly16_multi_flip<3, inverse>(out, in); + + butterfly16_multi_natural<0, inverse>(out, out); + butterfly16_multi_natural<1, inverse>(out, out); + butterfly16_multi_natural<2, inverse>(out, out); + butterfly16_multi_natural<3, inverse>(out, out); + } + } +}; + +template <> +struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> +{ + using T = double; + fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr); + if (this->need_reorder) + fft_reorder(out, csize_t<8>()); + } +}; + +template <typename T> +struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> +{ + fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) + { + this->name = 
type_name<decltype(*this)>(); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr); + if (this->need_reorder) + fft_reorder(out, csize_t<9>()); + } +}; + +template <typename T> +struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> +{ + fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr); + if (this->need_reorder) + fft_reorder(out, 10, cfalse); + } +}; + +} // namespace intrinsics + +template <typename T> +template <bool is_even, bool first> +void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>) +{ + constexpr size_t final_size = is_even ? 
1024 : 512; + + if (stage_size >= 2048) + { + add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(stage_size); + + make_fft(stage_size / 4, cbool_t<is_even>(), cfalse); + } + else + { + add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(final_size); + } +} + +template <typename T> +struct reverse_wrapper +{ + T& iterable; +}; + +template <typename T> +auto begin(reverse_wrapper<T> w) +{ + return std::rbegin(w.iterable); +} + +template <typename T> +auto end(reverse_wrapper<T> w) +{ + return std::rend(w.iterable); +} + +template <typename T> +reverse_wrapper<T> reversed(T&& iterable) +{ + return { iterable }; +} + +template <typename T> +void dft_plan<T>::initialize() +{ + data = autofree<u8>(data_size); + size_t offset = 0; + for (dft_stage_ptr& stage : stages) + { + stage->data = data.data() + offset; + stage->initialize(this->size); + offset += stage->data_size; + } + + bool to_scratch = false; + bool scratch_needed = false; + for (dft_stage_ptr& stage : reversed(stages)) + { + if (to_scratch) + { + scratch_needed = true; + } + stage->to_scratch = to_scratch; + if (!stage->can_inplace) + { + to_scratch = !to_scratch; + } + } + if (scratch_needed || !stages[0]->can_inplace) + this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); +} + +template <typename T> +const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, + const complex<T>* scratch, bool in_scratch) const +{ + if (stage == 0) + return in_scratch ? scratch : in; + return stages[stage - 1]->to_scratch ? scratch : out; +} + +template <typename T> +complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const +{ + return stages[stage]->to_scratch ? 
scratch : out; +} + +template <typename T> +template <bool inverse> +void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const +{ + if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) + { + return stages[0]->execute(cbool<inverse>, out, in, temp); + } + size_t stack[32] = { 0 }; + + complex<T>* scratch = + ptr_cast<complex<T>>(temp + this->temp_size - + align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); + + bool in_scratch = !stages[0]->can_inplace && in == out; + if (in_scratch) + { + builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); + } + + const size_t count = stages.size(); + + for (size_t depth = 0; depth < count;) + { + if (stages[depth]->recursion) + { + size_t offset = 0; + size_t rdepth = depth; + size_t maxdepth = depth; + do + { + if (stack[rdepth] == stages[rdepth]->repeats) + { + stack[rdepth] = 0; + rdepth--; + } + else + { + complex<T>* rout = select_out(rdepth, out, scratch); + const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); + stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); + offset += stages[rdepth]->out_offset; + stack[rdepth]++; + if (rdepth < count - 1 && stages[rdepth + 1]->recursion) + rdepth++; + else + maxdepth = rdepth; + } + } while (rdepth != depth); + depth = maxdepth + 1; + } + else + { + stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), + select_in(depth, out, in, scratch, in_scratch), temp); + depth++; + } + } +} + +template <typename T> +void dft_plan<T>::init_fft(size_t size, dft_order) +{ + const size_t log2n = ilog2(size); + cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n, + [&](auto log2n) { + (void)log2n; + constexpr size_t log2nv = val_of(decltype(log2n)()); + this->add_stage<intrinsics::fft_specialization<T, log2nv>>(size); + }, + [&]() { + cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { + this->make_fft(size, is_even, ctrue); + constexpr 
size_t is_evenv = val_of(decltype(is_even)()); + if (need_reorder) + this->add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(size); + }); + }); +} + +template <typename T> +dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0) +{ + need_reorder = true; + if (is_poweroftwo(size)) + { + init_fft(size, order); + } +#ifndef KFR_DFT_NO_NPo2 + else + { + init_dft(size, order); + } +#endif + initialize(); +} + +template <typename T> +dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) +{ + using namespace intrinsics; + + constexpr size_t width = vector_width<T> * 2; + + block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { + constexpr size_t width = val_of(decltype(w)()); + cwrite<width>(rtwiddle.data() + i, + cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); + }); +} + +template <typename T> +void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const +{ + using namespace intrinsics; + size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 + + constexpr size_t width = vector_width<T> * 2; + const cvec<T, 1> dc = cread<1>(out); + const size_t count = csize / 2; + + block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { + i++; + constexpr size_t width = val_of(decltype(w)()); + constexpr size_t widthm1 = width - 1; + const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); + const cvec<T, width> fpk = cread<width>(out + i); + const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1))); + + const cvec<T, width> f1k = fpk + fpnk; + const cvec<T, width> f2k = fpk - fpnk; + const cvec<T, width> t = cmul(f2k, tw); + cwrite<width>(out + i, T(0.5) * (f1k + t)); + cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t)))); + }); + + { + size_t k = csize / 2; + const cvec<T, 1> fpk = cread<1>(out + 
k); + const cvec<T, 1> fpnk = negodd(fpk); + cwrite<1>(out + k, fpnk); + } + if (fmt == dft_pack_format::CCs) + { + cwrite<1>(out, pack(dc[0] + dc[1], 0)); + cwrite<1>(out + csize, pack(dc[0] - dc[1], 0)); + } + else + { + cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1])); + } +} + +template <typename T> +void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const +{ + using namespace intrinsics; + + const size_t csize = this->size / 2; + + cvec<T, 1> dc; + + if (fmt == dft_pack_format::CCs) + { + dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real()); + } + else + { + dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag()); + } + + constexpr size_t width = vector_width<T> * 2; + const size_t count = csize / 2; + + block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) { + i++; + constexpr size_t width = val_of(decltype(w)()); + constexpr size_t widthm1 = width - 1; + const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); + const cvec<T, width> fpk = cread<width>(in + i); + const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); + + const cvec<T, width> f1k = fpk + fpnk; + const cvec<T, width> f2k = fpk - fpnk; + const cvec<T, width> t = cmul_conj(f2k, tw); + cwrite<width>(out + i, f1k + t); + cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); + }); + + { + size_t k = csize / 2; + const cvec<T, 1> fpk = cread<1>(in + k); + const cvec<T, 1> fpnk = 2 * negodd(fpk); + cwrite<1>(out + k, fpnk); + } + cwrite<1>(out, dc); +} + +template <typename T> +dft_plan<T>::~dft_plan() +{ +} + +template <typename T> +void dft_plan<T>::dump() const +{ + for (const dft_stage_ptr& s : stages) + { + s->dump(); + } +} +} // namespace CMT_ARCH_NAME + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp @@ -0,0 +1,50 
@@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ + +#ifdef FLOAT +#include "../fft.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template dft_plan<FLOAT>::dft_plan(size_t, dft_order); +template void dft_plan<FLOAT>::init_fft(size_t, dft_order); +template dft_plan<FLOAT>::~dft_plan(); +template void dft_plan<FLOAT>::dump() const; +template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out, + const kfr::complex<FLOAT>* in, kfr::u8* temp) const; +template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out, + const kfr::complex<FLOAT>* in, kfr::u8* temp) const; +template dft_plan_real<FLOAT>::dft_plan_real(size_t); +template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in, + kfr::dft_pack_format fmt) const; +template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const; +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#endif diff --git a/include/kfr/dft/impl/ft.hpp 
b/include/kfr/dft/impl/ft.hpp @@ -25,40 +25,45 @@ */ #pragma once -#include "../../base/complex.hpp" -#include "../../base/constants.hpp" -#include "../../base/digitreverse.hpp" -#include "../../base/read_write.hpp" -#include "../../base/sin_cos.hpp" #include "../../base/small_buffer.hpp" #include "../../base/univector.hpp" -#include "../../base/vec.hpp" +#include "../../math/sin_cos.hpp" +#include "../../simd/complex.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/digitreverse.hpp" +#include "../../simd/read_write.hpp" +#include "../../simd/vec.hpp" #include "../../base/memory.hpp" -#include "../../data/sincos.hpp" +#include "../data/sincos.hpp" CMT_PRAGMA_MSVC(warning(push)) CMT_PRAGMA_MSVC(warning(disable : 4127)) namespace kfr { +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N> +using cvec = vec<T, N * 2>; -namespace internal +namespace intrinsics { template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) { return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) { vec<T, N> yy = resize<N>(y); return cmul_impl(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) { vec<T, N> xx = resize<N>(x); return cmul_impl(xx, y); @@ -66,24 +71,24 @@ CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) /// Complex Multiplication template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y) +KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const 
vec<T, N2>& y) { - return internal::cmul_impl(x, y); + return intrinsics::cmul_impl(x, y); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) { return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) +KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) { return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, - const vec<T, N>& tw) +KFR_INTRINSIC void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, + const vec<T, N>& tw) { const vec<T, N> twr = dupeven(tw); const vec<T, N> twi = dupodd(tw); @@ -95,82 +100,79 @@ CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in out1 += sumtw - diftw; } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) { vec<T, N> yy = resize<N>(y); return cmul_conj(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) { vec<T, N> xx = resize<N>(x); return cmul_conj(xx, y); } -template <typename T, size_t N> -using cvec = vec<T, N * 2>; - template <size_t N, bool A = false, typename T> -CMT_INLINE cvec<T, N> cread(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src) { return cvec<T, N>(ptr_cast<T>(src), 
cbool_t<A>()); } template <size_t N, bool A = false, typename T> -CMT_INLINE void cwrite(complex<T>* dest, const cvec<T, N>& value) +KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) { value.write(ptr_cast<T>(dest)); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) +KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) { return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -CMT_INLINE void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>) +KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... }; } template <size_t count, size_t N, bool A, typename T, size_t... indices> -CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) +KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) { return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, bool A, typename T, size_t... indices> -CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value, - csizes_t<indices...>) +KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value, + csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... 
}; } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src) +KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src) { return cread_group_impl<count, N, stride, A>(src, csizeseq_t<count>()); } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -CMT_INLINE void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value) +KFR_INTRINSIC void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value) { return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq_t<count>()); } template <size_t count, size_t N, bool A = false, typename T> -CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) +KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) { return cread_group_impl<count, N, A>(src, stride, csizeseq_t<count>()); } template <size_t count, size_t N, bool A = false, typename T> -CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) +KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) { return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq_t<count>()); } template <size_t N, bool A = false, bool split = false, typename T> -CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) { cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); if (split) @@ -179,7 +181,7 @@ CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) } template <size_t N, bool A = false, bool split = false, typename T> -CMT_INLINE void cwrite_split(complex<T>* dest, const cvec<T, N>& value) +KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) { cvec<T, N> v = value; if (split) @@ -262,13 +264,13 @@ inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, } template <size_t N, size_t stride, 
typename T, size_t... Indices> -CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...); } template <size_t N, size_t stride, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) { if (stride == 1) { @@ -278,7 +280,7 @@ CMT_INLINE cvec<T, N> cgather(const complex<T>* base) return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); } -CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) +KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) { size_t temp = index; index += stride; @@ -286,7 +288,7 @@ CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t index -= size; return temp; } -CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) +KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t) { size_t temp = index; index += stride; @@ -294,45 +296,45 @@ CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t) } template <size_t N, typename T, size_t... 
Indices> -CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, - csizes_t<Indices...>) +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, + csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...); } template <size_t N, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) { return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); } template <size_t N, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride) { size_t index = 0; return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); } template <size_t N, typename T, size_t... Indices> -CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, - csizes_t<Indices...>) +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, + csizes_t<Indices...>) { return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...); } template <size_t N, typename T> -CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) { return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>()); } template <size_t N, size_t stride, typename T, size_t... Indices> -CMT_INLINE void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) +KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... 
}; } template <size_t N, size_t stride, typename T> -CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value) +KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) { if (stride == 1) { @@ -345,34 +347,35 @@ CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value) } template <size_t N, typename T, size_t... Indices> -CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, - csizes_t<Indices...>) +KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, + csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; } template <size_t N, typename T> -CMT_INLINE void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) +KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) { return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>()); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) +KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) { return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>()); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) +KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) { return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>()); } template <typename T> -KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2, - const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, - cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, cvec<T, 4>& w7) +KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, 
const cvec<T, 8>& z1, const cvec<T, 8>& z2, + const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, + cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, + cvec<T, 4>& w7) { cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3)); cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3)); @@ -389,10 +392,10 @@ KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const c } template <typename T> -KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, - const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, - const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, - cvec<T, 8>& z2, cvec<T, 8>& z3) +KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, + const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, + const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, + cvec<T, 8>& z2, cvec<T, 8>& z3) { cvec<T, 16> a = concat(w0, w1, w2, w3); cvec<T, 16> b = concat(w4, w5, w6, w7); @@ -405,7 +408,7 @@ KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const c } template <typename T> -void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) +KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) { cvec<T, 4> a0, a1, a2, a3; cvec<T, 4> b0, b1, b2, b3; @@ -423,8 +426,8 @@ void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) d = concat(a3, b3, c3, d3); } template <typename T> -void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, - cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) +KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, + cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) { cvec<T, 4> a0, a1, a2, a3; cvec<T, 4> b0, b1, b2, b3; @@ -443,35 +446,35 @@ void 
transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, } template <bool b, typename T> -constexpr KFR_INTRIN T chsign(T x) +constexpr KFR_INTRINSIC T chsign(T x) { return b ? -x : x; } template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false, size_t... indices> -constexpr KFR_INTRIN cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) +constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) { return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start))) : cos_using_table<T>(size, (indices / 2 * step + start)))...); } template <typename T, size_t width, size_t... indices> -constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, size_t start, - size_t step) +constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, + size_t start, size_t step) { return make_vector((indices & 1 ? -sin_using_table<T>(size, indices / 2 * step + start) : cos_using_table<T>(size, indices / 2 * step + start))...); } template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false> -constexpr KFR_INTRIN cvec<T, width> fixed_twiddle() +constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle() { return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>()); } template <typename T, size_t width> -constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) +constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) { return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size); } @@ -480,7 +483,7 @@ constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, siz // constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); template <typename T, size_t N, bool inverse> 
-constexpr cvec<T, N> twiddleimagmask() +constexpr KFR_INTRINSIC cvec<T, N> twiddleimagmask() { return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1); } @@ -498,7 +501,7 @@ CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x) template <size_t k, size_t size, bool inverse = false, typename T, size_t width, size_t kk = (inverse ? size - k : k) % size> -KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x) +KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) { constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); if (kk == 0) @@ -540,7 +543,7 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x) } template <size_t N, typename T> -KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) +KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) { const cvec<T, N> sum = a0 + a1; const cvec<T, N> dif = a0 - a1; @@ -549,15 +552,15 @@ KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N } template <size_t N, typename T> -KFR_INTRIN void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) +KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) { butterfly2<N>(a0, a1, a0, a1); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, - const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) { cvec<T, N> sum02, sum13, diff02, diff13; cvec<T, N * 2> a01, a23, sum0213, diff0213; @@ -589,9 +592,9 @@ KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, cons } template <size_t N, bool inverse = 
false, typename T> -KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, - const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) { vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3; vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3; @@ -616,11 +619,11 @@ KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, - cvec<T, N>& w7) +KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, + cvec<T, N>& w7) { cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6; butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6); @@ -642,14 +645,14 @@ KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) +KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) { 
butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7); } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) +KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) { cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67; @@ -670,7 +673,7 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, 8>& v8) +KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8) { cvec<T, 2> w0, w1, w2, w3; split(v8, w0, w1, w2, w3); @@ -679,7 +682,7 @@ KFR_INTRIN void butterfly8(cvec<T, 8>& v8) } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly32(cvec<T, 32>& v32) +KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32) { cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; split(v32, w0, w1, w2, w3, w4, w5, w6, w7); @@ -701,7 +704,7 @@ KFR_INTRIN void butterfly32(cvec<T, 32>& v32) } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly4(cvec<T, N * 4>& a0123) +KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123) { cvec<T, N> a0; cvec<T, N> a1; @@ -713,7 +716,7 @@ KFR_INTRIN void butterfly4(cvec<T, N * 4>& a0123) } template <size_t N, typename T> -KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01) +KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01) { cvec<T, N> a0; cvec<T, N> a1; @@ -723,7 +726,7 @@ KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01) } template <size_t N, bool inverse = false, bool split_format = false, typename T> -KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) +KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) { if (split_format) { @@ -750,9 +753,9 @@ KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec< } template <size_t N, bool inverse = false, bool split_format = 
false, typename T> -KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, - const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, + const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, + cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) { apply_twiddle<N, inverse, split_format>(a1, tw1, w1); apply_twiddle<N, inverse, split_format>(a2, tw2, w2); @@ -760,31 +763,31 @@ KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, cons } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, - const cvec<T, N>& tw3) +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, + const cvec<T, N>& tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3); } template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]> -KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, - const cvec<T, 1>& tw3) +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, + const cvec<T, 1>& tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); } template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]> -KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2, - cvec<T, N / 2> tw3) 
+KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2, + cvec<T, N / 2> tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d) +KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d) { cvec<T, 4> b0, b1, b2, b3; cvec<T, 4> c0, c1, c2, c3; @@ -812,7 +815,7 @@ KFR_INTRIN void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, c } template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T> -KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) +KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) { cvec<T, N> a0; cvec<T, N> a1; @@ -830,7 +833,7 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) } template <bool inverse, bool aligned, typename T> -KFR_INTRIN void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in) { cvec<T, 16> w0, w1, w2, w3; @@ -886,7 +889,7 @@ KFR_INTRIN void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, } template <bool inverse = false, typename T> -KFR_INTRIN void butterfly16(cvec<T, 16>& v16) +KFR_INTRINSIC void butterfly16(cvec<T, 16>& v16) { butterfly4<4, inverse>(v16); apply_twiddles4<0, 4, 4, inverse>(v16); @@ -895,7 +898,7 @@ KFR_INTRIN void butterfly16(cvec<T, 16>& v16) } template <size_t index, bool inverse = false, typename T> -KFR_INTRIN void butterfly16_multi_natural(complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in) { constexpr size_t N = 4; @@ -954,7 +957,7 @@ KFR_INTRIN void 
butterfly16_multi_natural(complex<T>* out, const complex<T>* in) } template <size_t index, bool inverse = false, typename T> -KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) { constexpr size_t N = 4; @@ -1011,7 +1014,7 @@ KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) } template <size_t n2, size_t nnstep, size_t N, typename T> -KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1) +KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1) { cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>(); @@ -1026,8 +1029,8 @@ static const cvec<T, N> tw3i1 = static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01, - cvec<T, N>& w02) +KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, + cvec<T, N>& w01, cvec<T, N>& w02) { const cvec<T, N> sum1 = a01 + a02; @@ -1043,15 +1046,16 @@ KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) +KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) { butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, cvec<T, N>& w0, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5) +KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + cvec<T, N>& 
w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, + cvec<T, N>& w5) { cvec<T, N* 2> a03 = concat(a0, a3); cvec<T, N* 2> a25 = concat(a2, a5); @@ -1073,8 +1077,8 @@ KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5) +KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5) { butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5); } @@ -1090,11 +1094,11 @@ const static cvec<T, 1> tw9_4 = { T(-0.93969262078590838405410927732473), (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) }; template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, cvec<T, N>& w0, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, - cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) +KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, + cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) { cvec<T, N* 3> a012 = concat(a0, a1, a2); cvec<T, N* 3> a345 = concat(a3, a4, a5); @@ -1121,8 +1125,8 @@ KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, 
cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) +KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) { butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8); } @@ -1149,9 +1153,10 @@ static const cvec<T, N> tw7i3 = static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, - cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, - cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06) +KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, + cvec<T, N>& w06) { const cvec<T, N> sum1 = a01 + a06; const cvec<T, N> dif1 = swap<2>(a01 - a06); @@ -1184,8 +1189,8 @@ KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, cvec<T, N>& a6) +KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6) { butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6); } @@ -1226,11 +1231,11 @@ static const cvec<T, N> tw11i5 = static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, - cvec<T, N> a05, cvec<T, N> a06, 
cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, - cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, - cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, - cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) +KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, + cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, + cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, + cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) { const cvec<T, N> sum1 = a01 + a10; const cvec<T, N> dif1 = swap<2>(a01 - a10); @@ -1300,9 +1305,9 @@ const static cvec<T, N> tw5i2 = static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>(); template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, - const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, - cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) +KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, + const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) { const cvec<T, N> sum1 = a01 + a04; const cvec<T, N> dif1 = swap<2>(a01 - a04); @@ -1323,12 +1328,12 @@ KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const c } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, - const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, - cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, 
cvec<T, N>& w6, cvec<T, N>& w7, - cvec<T, N>& w8, cvec<T, N>& w9) +KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, + cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, + cvec<T, N>& w8, cvec<T, N>& w9) { cvec<T, N* 2> a05 = concat(a0, a5); cvec<T, N* 2> a27 = concat(a2, a7); @@ -1363,91 +1368,96 @@ KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cv } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, - vec<T, N>& out1) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, + vec<T, N>& out1) { butterfly2<N / 2>(in0, in1, out0, out1); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) { butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3) { butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3); } 
template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4) { butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, vec<T, N>& out0, - vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) { butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& 
out4, vec<T, N>& out5, + vec<T, N>& out6) { butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, - vec<T, N>& out7) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7) { butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, vec<T, N>& out0, - vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, - vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, + vec<T, N>& out7, vec<T, N>& out8) { butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, 
out2, out3, out4, out5, out6, out7, out8); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, - const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, - vec<T, N>& out8, vec<T, N>& out9) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, + vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9) { butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, - const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, - const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, - const vec<T, N>& in9, const vec<T, N>& in10, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, - vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10, + vec<T, N>& out0, vec<T, N>& out1, vec<T, 
N>& out2, vec<T, N>& out3, + vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, + vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) { butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10); } template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> -KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) +KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr)); if (transposed) @@ -1456,8 +1466,8 @@ KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec } // Warning: Reads past the end. Use with care -KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, cvec<f32, 4>& w1, - cvec<f32, 4>& w2) +KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, + cvec<f32, 4>& w1, cvec<f32, 4>& w2) { cvec<f32, 4> w3; cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9)); @@ -1465,8 +1475,8 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3 split(v16, w0, w1, w2, w3); } -KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, cvec<f32, 4>& w1, - cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4) +KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, + cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4) { cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15)); v16 = digitreverse4<2>(v16); @@ -1475,7 +1485,7 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3 } template <bool transposed, typename T, size_t... 
N, size_t Nout = csum<size_t, N...>()> -KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) +KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) { auto temp = concat(args...); if (transposed) @@ -1484,20 +1494,21 @@ KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N } template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> -KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) +KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) { return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1))); } template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> -KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) +KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) { return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1))); } // Non-final template <typename T, size_t width, size_t radix, bool inverse, size_t... I> -KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>, - complex<T>* out, const complex<T>* in, const complex<T>* tw, size_t stride) +KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, + cbool_t<inverse>, complex<T>* out, const complex<T>* in, + const complex<T>* tw, size_t stride) { carray<cvec<T, width>, radix> inout; @@ -1513,8 +1524,8 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize // Final template <typename T, size_t width, size_t radix, bool inverse, size_t... 
I> -KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>, - complex<T>* out, const complex<T>* in, size_t stride) +KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, + cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride) { carray<cvec<T, width>, radix> inout; @@ -1527,17 +1538,17 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize } template <size_t width, size_t radix, typename... Args> -KFR_INTRIN void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) +KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) { butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...); } template <typename... Args> -KFR_INTRIN void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) +KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) { } template <size_t width, typename... Args> -KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) +KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) { CMT_LOOP_NOUNROLL for (; i < count / width * width; i += width) @@ -1546,7 +1557,7 @@ KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&. } template <size_t width, typename... Args> -KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args) +KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&... args) { CMT_ASSUME(count > 0); size_t i = 0; @@ -1554,16 +1565,17 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... 
args) } template <typename T, bool inverse, typename Tradix, typename Tstride> -KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, Tradix radix, cbool_t<inverse>, complex<T>*, - const complex<T>*, Tstride, size_t, size_t, const complex<T>*, size_t) +KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*, + const complex<T>*, Tstride, size_t, size_t, const complex<T>*, + size_t) { } template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix, typename Thalfradixsqr, typename Tstride> -KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, Tstride ostride, Thalfradix halfradix, - Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) +KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, Tstride ostride, Thalfradix halfradix, + Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) { CMT_LOOP_NOUNROLL for (; i < halfradix / width * width; i += width) @@ -1605,19 +1617,19 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<in } template <typename T> -KFR_SINTRIN vec<T, 2> hcadd(vec<T, 2> value) +KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value) { return value; } template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)> -KFR_SINTRIN vec<T, 2> hcadd(vec<T, N> value) +KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value) { return hcadd(low(value) + high(value)); } template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, - const complex<T>* twiddle, Tstride ostride = Tstride{}) +KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle, Tstride ostride = Tstride{}) { CMT_ASSUME(radix > 0); { @@ -1636,8 +1648,7 @@ 
KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* } cwrite<1>(out, hcadd(sum) + sums); } - const auto halfradix = radix / 2; - const auto halfradix_sqr = halfradix * halfradix; + const auto halfradix = radix / 2; CMT_ASSUME(halfradix > 0); size_t i = 0; @@ -1646,9 +1657,9 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* } template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, const complex<T>* twiddle, - Tstride ostride = Tstride{}) +KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, const complex<T>* twiddle, + Tstride ostride = Tstride{}) { { cvec<T, width> sum = T(); @@ -1676,16 +1687,16 @@ KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, compl } template <typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, - complex<T>* temp, const complex<T>* twiddle, Tstride ostride = {}) +KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, + complex<T>*, const complex<T>* twiddle, Tstride ostride = {}) { cswitch(csizes_t<11, 13>(), radix, [&](auto radix_) CMT_INLINE_LAMBDA { - constexpr size_t width = platform<T>::vector_width; + constexpr size_t width = vector_width<T>; spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride); }, [&]() CMT_INLINE_LAMBDA { - constexpr size_t width = platform<T>::vector_width; + constexpr size_t width = vector_width<T>; generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); }); } @@ -1697,25 +1708,25 @@ template <typename T, size_t N> constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T()); 
template <bool A = false, typename T, size_t N> -KFR_INTRIN void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) +KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) { cwrite<N / 2, A>(dest, bitreverse<2>(x)); } template <bool A = false, typename T, size_t N> -KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) +KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) { cwrite<N / 2, A>(dest, digitreverse4<2>(x)); } template <size_t N, bool A = false, typename T> -KFR_INTRIN cvec<T, N> cbitreverse_read(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src) { return bitreverse<2>(cread<N, A>(src)); } template <size_t N, bool A = false, typename T> -KFR_INTRIN cvec<T, N> cdigitreverse4_read(const complex<T>* src) +KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src) { return digitreverse4<2>(cread<N, A>(src)); } @@ -1723,7 +1734,7 @@ KFR_INTRIN cvec<T, N> cdigitreverse4_read(const complex<T>* src) #if 1 template <> -KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) +KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) { return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12), cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13), @@ -1731,7 +1742,7 @@ KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15)); } template <> -KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) +KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) { cwrite<1>(dest, part<16, 0>(x)); cwrite<1>(dest + 4, part<16, 1>(x)); @@ -1754,7 +1765,8 @@ KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const v cwrite<1>(dest + 15, part<16, 
15>(x)); } #endif -} // namespace internal +} // namespace intrinsics +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dft/reference_dft.hpp b/include/kfr/dft/reference_dft.hpp @@ -25,13 +25,13 @@ */ #pragma once -#include "../base/complex.hpp" -#include "../base/constants.hpp" #include "../base/memory.hpp" -#include "../base/read_write.hpp" #include "../base/small_buffer.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/complex.hpp" +#include "../simd/constants.hpp" +#include "../simd/read_write.hpp" +#include "../simd/vec.hpp" #include <cmath> #include <vector> diff --git a/include/kfr/dsp.hpp b/include/kfr/dsp.hpp @@ -33,7 +33,6 @@ #include "dsp/fir_design.hpp" #include "dsp/fracdelay.hpp" #include "dsp/goertzel.hpp" -#include "dsp/interpolation.hpp" #include "dsp/mixdown.hpp" #include "dsp/oscillators.hpp" #include "dsp/sample_rate_conversion.hpp" diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup biquad * @{ */ /* @@ -26,13 +26,16 @@ #pragma once #include "../base/filter.hpp" -#include "../base/function.hpp" -#include "../base/operators.hpp" #include "../base/pointer.hpp" -#include "../base/vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" +#include "../testo/assert.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class biquad_type { @@ -53,17 +56,24 @@ template <typename T> struct biquad_params { template <typename U> - constexpr biquad_params(const biquad_params<U>& bq) noexcept - : a0(static_cast<T>(bq.a0)), a1(static_cast<T>(bq.a1)), a2(static_cast<T>(bq.a2)), - b0(static_cast<T>(bq.b0)), b1(static_cast<T>(bq.b1)), b2(static_cast<T>(bq.b2)) + constexpr biquad_params(const biquad_params<U>& bq) CMT_NOEXCEPT : a0(static_cast<T>(bq.a0)), + a1(static_cast<T>(bq.a1)), + a2(static_cast<T>(bq.a2)), + b0(static_cast<T>(bq.b0)), + 
b1(static_cast<T>(bq.b1)), + b2(static_cast<T>(bq.b2)) { } constexpr static bool is_pod = true; static_assert(std::is_floating_point<T>::value, "T must be a floating point type"); - constexpr biquad_params() noexcept : a0(1), a1(0), a2(0), b0(1), b1(0), b2(0) {} - constexpr biquad_params(T a0, T a1, T a2, T b0, T b1, T b2) noexcept - : a0(a0), a1(a1), a2(a2), b0(b0), b1(b1), b2(b2) + constexpr biquad_params() CMT_NOEXCEPT : a0(1), a1(0), a2(0), b0(1), b1(0), b2(0) {} + constexpr biquad_params(T a0, T a1, T a2, T b0, T b1, T b2) CMT_NOEXCEPT : a0(a0), + a1(a1), + a2(a2), + b0(b0), + b1(b1), + b2(b2) { } T a0; @@ -90,7 +100,7 @@ struct biquad_state vec<T, filters> s1; vec<T, filters> s2; vec<T, filters> out; - constexpr biquad_state() noexcept : s1(0), s2(0), out(0) {} + constexpr biquad_state() CMT_NOEXCEPT : s1(0), s2(0), out(0) {} }; template <typename T, size_t filters, KFR_ARCH_DEP> @@ -102,8 +112,8 @@ struct biquad_block vec<T, filters> b1; vec<T, filters> b2; - constexpr biquad_block() noexcept : a1(0), a2(0), b0(1), b1(0), b2(0) {} - CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) noexcept + constexpr biquad_block() CMT_NOEXCEPT : a1(0), a2(0), b0(1), b1(0), b2(0) {} + CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) CMT_NOEXCEPT { count = count > filters ? 
filters : count; for (size_t i = 0; i < count; i++) @@ -125,38 +135,40 @@ struct biquad_block } template <size_t count> - constexpr biquad_block(const biquad_params<T> (&bq)[count]) noexcept : biquad_block(bq, count) + constexpr biquad_block(const biquad_params<T> (&bq)[count]) CMT_NOEXCEPT : biquad_block(bq, count) { static_assert(count <= filters, "count > filters"); } }; template <size_t filters, typename T, typename E1, KFR_ARCH_DEP> -struct expression_biquads_l : public expression_base<E1> +struct expression_biquads_l : public expression_with_arguments<E1> { using value_type = T; expression_biquads_l(const biquad_block<T, filters>& bq, E1&& e1) - : expression_base<E1>(std::forward<E1>(e1)), bq(bq) + : expression_with_arguments<E1>(std::forward<E1>(e1)), bq(bq) { } template <size_t width> - KFR_INTRIN vec<T, width> operator()(cinput_t cinput, size_t index, vec_t<T, width> t) const + friend KFR_INTRINSIC vec<T, width> get_elements(const expression_biquads_l& self, cinput_t cinput, size_t index, + vec_shape<T, width> t) { - const vec<T, width> in = this->argument_first(cinput, index, t); + const vec<T, width> in = self.argument_first(cinput, index, t); vec<T, width> out; CMT_LOOP_UNROLL for (size_t i = 0; i < width; i++) { - state.out = process(bq, state, insertleft(in[i], state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(in[i], self.state.out)); + out[i] = self.state.out[filters - 1]; } return out; } - KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state, - const vec<T, filters>& in) + static KFR_MEM_INTRINSIC vec<T, filters> process(const biquad_block<T, filters>& bq, + biquad_state<T, filters>& state, + const vec<T, filters>& in) { const vec<T, filters> out = bq.b0 * in + state.s1; state.s1 = state.s2 + bq.b1 * in - bq.a1 * out; @@ -168,73 +180,74 @@ struct expression_biquads_l : public expression_base<E1> }; template <size_t filters, typename T, 
typename E1, KFR_ARCH_DEP> -struct expression_biquads : expression_base<E1> +struct expression_biquads : expression_with_arguments<E1> { using value_type = T; expression_biquads(const biquad_block<T, filters>& bq, E1&& e1) - : expression_base<E1>(std::forward<E1>(e1)), bq(bq), block_end(0) + : expression_with_arguments<E1>(std::forward<E1>(e1)), bq(bq), block_end(0) { } - CMT_INLINE void begin_block(cinput_t cinput, size_t size) const + void begin_block(cinput_t cinput, size_t size) const { block_end = size; for (size_t i = 0; i < filters - 1; i++) { - const vec<T, 1> in = i < size ? this->argument_first(cinput, i, vec_t<T, 1>()) : 0; + const vec<T, 1> in = i < size ? this->argument_first(cinput, i, vec_shape<T, 1>()) : 0; state.out = process(bq, state, insertleft(in[0], state.out)); } } - CMT_INLINE void end_block(cinput_t cinput, size_t) const { state = saved_state; } + void end_block(cinput_t, size_t) const { state = saved_state; } template <size_t width> - KFR_INTRIN vec<T, width> operator()(cinput_t cinput, size_t index, vec_t<T, width> t) const + friend KFR_INTRINSIC vec<T, width> get_elements(const expression_biquads& self, cinput_t cinput, size_t index, + vec_shape<T, width> t) { index += filters - 1; - vec<T, width> out; - if (index + width <= block_end) + vec<T, width> out{}; + if (index + width <= self.block_end) { - const vec<T, width> in = this->argument_first(cinput, index, t); + const vec<T, width> in = self.argument_first(cinput, index, t); CMT_LOOP_UNROLL for (size_t i = 0; i < width; i++) { - state.out = process(bq, state, insertleft(in[i], state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(in[i], self.state.out)); + out[i] = self.state.out[filters - 1]; } - if (index + width == block_end) - saved_state = state; + if (index + width == self.block_end) + self.saved_state = self.state; } - else if (index >= block_end) + else if (index >= self.block_end) { CMT_LOOP_UNROLL for (size_t i = 0; i < 
width; i++) { - state.out = process(bq, state, insertleft(T(0), state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(T(0), self.state.out)); + out[i] = self.state.out[filters - 1]; } } else { size_t i = 0; - for (; i < std::min(width, block_end - index); i++) + for (; i < std::min(width, self.block_end - index); i++) { - const vec<T, 1> in = this->argument_first(cinput, index + i, vec_t<T, 1>()); - state.out = process(bq, state, insertleft(in[0], state.out)); - out[i] = state.out[filters - 1]; + const vec<T, 1> in = self.argument_first(cinput, index + i, vec_shape<T, 1>()); + self.state.out = process(self.bq, self.state, insertleft(in[0], self.state.out)); + out[i] = self.state.out[filters - 1]; } - saved_state = state; + self.saved_state = self.state; for (; i < width; i++) { - state.out = process(bq, state, insertleft(T(0), state.out)); - out[i] = state.out[filters - 1]; + self.state.out = process(self.bq, self.state, insertleft(T(0), self.state.out)); + out[i] = self.state.out[filters - 1]; } } return out; } - KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state, - vec<T, filters> in) + static KFR_MEM_INTRINSIC vec<T, filters> process(const biquad_block<T, filters>& bq, + biquad_state<T, filters>& state, vec<T, filters> in) { const vec<T, filters> out = bq.b0 * in + state.s1; state.s1 = state.s2 + bq.b1 * in - bq.a1 * out; @@ -255,7 +268,7 @@ struct expression_biquads : expression_base<E1> * @param e1 Input expression */ template <typename T, typename E1> -CMT_INLINE internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& bq, E1&& e1) +KFR_FUNCTION internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& bq, E1&& e1) { const biquad_params<T> bqs[1] = { bq }; return internal::expression_biquads<1, T, E1>(bqs, std::forward<E1>(e1)); @@ -268,8 +281,8 @@ CMT_INLINE internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& * 
@note This implementation introduces delay of N - 1 samples, where N is the filter count. */ template <size_t filters, typename T, typename E1> -CMT_INLINE internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_params<T> (&bq)[filters], - E1&& e1) +KFR_FUNCTION internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_params<T> (&bq)[filters], + E1&& e1) { return internal::expression_biquads_l<filters, T, E1>(bq, std::forward<E1>(e1)); } @@ -281,7 +294,8 @@ CMT_INLINE internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_ * @note This implementation has zero latency */ template <size_t filters, typename T, typename E1> -CMT_INLINE internal::expression_biquads<filters, T, E1> biquad(const biquad_params<T> (&bq)[filters], E1&& e1) +KFR_FUNCTION internal::expression_biquads<filters, T, E1> biquad(const biquad_params<T> (&bq)[filters], + E1&& e1) { return internal::expression_biquads<filters, T, E1>(bq, std::forward<E1>(e1)); } @@ -292,10 +306,11 @@ CMT_INLINE internal::expression_biquads<filters, T, E1> biquad(const biquad_para * @param e1 Input expression * @note This implementation has zero latency */ -template <typename T, typename E1> -CMT_INLINE expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count, E1&& e1) +template <size_t maxfiltercount = 4, typename T, typename E1> +KFR_FUNCTION expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count, E1&& e1) { - return cswitch(csizes_t<1, 2, 4, 8, 16, 32, 64>(), next_poweroftwo(count), + constexpr csizes_t<1, 2, 4, 8, 16, 32, 64> sizes; + return cswitch(cfilter(sizes, sizes <= csize_t<maxfiltercount>{}), next_poweroftwo(count), [&](auto x) { constexpr size_t filters = x; return to_pointer(internal::expression_biquads<filters, T, E1>( @@ -304,12 +319,12 @@ CMT_INLINE expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count [&] { return to_pointer(zeros<T>()); }); } -template <typename T> +template <typename T, size_t maxfiltercount = 4> class 
biquad_filter : public expression_filter<T> { public: biquad_filter(const biquad_params<T>* bq, size_t count) - : expression_filter<T>(biquad(bq, count, placeholder<T>())) + : expression_filter<T>(biquad<maxfiltercount>(bq, count, placeholder<T>())) { } @@ -318,4 +333,5 @@ public: { } }; +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/biquad_design.hpp b/include/kfr/dsp/biquad_design.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup biquad * @{ */ /* @@ -30,6 +30,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ /** * @brief Calculates coefficients for the all-pass biquad filter @@ -38,7 +40,7 @@ namespace kfr * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) { const T alpha = std::sin(frequency) / 2.0 * Q; const T cs = std::cos(frequency); @@ -59,7 +61,7 @@ biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -79,7 +81,7 @@ biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -99,7 +101,7 @@ biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> 
biquad_bandpass(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -119,7 +121,7 @@ biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) +KFR_FUNCTION biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) { const T K = std::tan(c_pi<T, 1> * frequency); const T K2 = K * K; @@ -140,7 +142,7 @@ biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain) +KFR_FUNCTION biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -177,7 +179,7 @@ biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> g * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) +KFR_FUNCTION biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -214,7 +216,7 @@ biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain) * @return Biquad filter coefficients */ template <typename T = fbase> -biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) +KFR_FUNCTION biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) { biquad_params<T> result; const T K = std::tan(c_pi<T, 1> * frequency); @@ -243,4 +245,5 @@ biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain) } return result; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/dcremove.hpp b/include/kfr/dsp/dcremove.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp 
+/** @addtogroup biquad * @{ */ /* @@ -30,11 +30,14 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename E1, typename T = flt_type<value_type_of<E1>>> -CMT_INLINE internal::expression_biquads<1, T, E1> dcremove(E1&& e1, double cutoff = 0.00025) +KFR_INTRINSIC internal::expression_biquads<1, T, E1> dcremove(E1&& e1, double cutoff = 0.00025) { const biquad_params<T> bqs[1] = { biquad_highpass(cutoff, 0.5) }; return internal::expression_biquads<1, T, E1>(bqs, std::forward<E1>(e1)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/delay.hpp b/include/kfr/dsp/delay.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -30,43 +30,48 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ namespace internal { template <size_t delay, typename E> -struct expression_delay : expression_base<E> +struct expression_delay : expression_with_arguments<E> { using value_type = value_type_of<E>; using T = value_type; - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; template <size_t N, KFR_ENABLE_IF(N <= delay)> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, N>) { vec<T, N> out; - size_t c = cursor; - data.ringbuf_read(c, out); - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - data.ringbuf_write(cursor, in); + size_t c = self.cursor; + self.data.ringbuf_read(c, out); + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + self.data.ringbuf_write(self.cursor, in); return out; } - vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const + friend vec<T, 1> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, 1>) { T out; - size_t c = cursor; - data.ringbuf_read(c, out); - const T in = this->argument_first(cinput, 
index, vec_t<T, 1>())[0]; - data.ringbuf_write(cursor, in); + size_t c = self.cursor; + self.data.ringbuf_read(c, out); + const T in = self.argument_first(cinput, index, vec_shape<T, 1>())[0]; + self.data.ringbuf_write(self.cursor, in); return out; } template <size_t N, KFR_ENABLE_IF(N > delay)> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + friend vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, N>) { vec<T, delay> out; - size_t c = cursor; - data.ringbuf_read(c, out); - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - data.ringbuf_write(cursor, slice<N - delay, delay>(in)); + size_t c = self.cursor; + self.data.ringbuf_read(c, out); + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + self.data.ringbuf_write(self.cursor, slice<N - delay, delay>(in)); return concat_and_slice<0, N>(out, in); } @@ -75,18 +80,19 @@ struct expression_delay : expression_base<E> }; template <typename E> -struct expression_delay<1, E> : expression_base<E> +struct expression_delay<1, E> : expression_with_arguments<E> { using value_type = value_type_of<E>; using T = value_type; - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index, + vec_shape<T, N>) { - const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>()); - const vec<T, N> out = insertleft(data, in); - data = in[N - 1]; + const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>()); + const vec<T, N> out = insertleft(self.data, in); + self.data = in[N - 1]; return out; } mutable value_type data = value_type(0); @@ -103,9 +109,10 @@ struct expression_delay<1, E> : expression_base<E> * @endcode */ template <size_t samples = 1, 
typename E1> -CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>()) +KFR_INTRINSIC internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>()) { static_assert(samples >= 1 && samples < 1024, ""); return internal::expression_delay<samples, E1>(std::forward<E1>(e1)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/ebu.hpp b/include/kfr/dsp/ebu.hpp @@ -1,3 +1,28 @@ +/** @addtogroup ebu + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ #pragma once #include <vector> @@ -16,15 +41,17 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Winaccessible-base") namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T> -KFR_SINTRIN T energy_to_loudness(T energy) +KFR_INTRINSIC T energy_to_loudness(T energy) { return T(10) * log10(energy) - T(0.691); } template <typename T> -KFR_SINTRIN T loudness_to_energy(T loudness) +KFR_INTRINSIC T loudness_to_energy(T loudness) { return exp10((loudness + T(0.691)) * T(0.1)); } @@ -88,8 +115,8 @@ public: } private: - mutable bool m_integrated_cached; mutable T m_integrated; + mutable bool m_integrated_cached; }; template <typename T> @@ -98,10 +125,10 @@ struct lra_vec : public univector<T> private: void compute() const { - m_range_high = -70.0; - m_range_low = -70.0; - static const T PRC_LOW = 0.10; - static const T PRC_HIGH = 0.95; + m_range_high = -70; + m_range_low = -70; + static const T PRC_LOW = T(0.10); + static const T PRC_HIGH = T(0.95); const T z_total = mean(*this); const T relative_gate = energy_to_loudness(z_total) - 20; @@ -151,13 +178,13 @@ public: } private: - mutable bool m_lra_cached; mutable T m_range_low; mutable T m_range_high; + mutable bool m_lra_cached; }; template <typename T> -KFR_SINTRIN expression_pointer<T> make_kfilter(int samplerate) +KFR_INTRINSIC expression_pointer<T> make_kfilter(int samplerate) { const biquad_params<T> bq[] = { biquad_highshelf(T(1681.81 / samplerate), T(+4.0)), @@ -199,8 +226,8 @@ public: void reset() { - std::fill(m_short_sum_of_squares.begin(), m_short_sum_of_squares.end(), 0); - std::fill(m_momentary_sum_of_squares.begin(), m_momentary_sum_of_squares.end(), 0); + std::fill(m_short_sum_of_squares.begin(), m_short_sum_of_squares.end(), T(0)); + std::fill(m_momentary_sum_of_squares.begin(), m_momentary_sum_of_squares.end(), T(0)); } void process_packet(const T* src) @@ -214,15 +241,15 @@ public: Speaker get_speaker() const { return m_speaker; } private: + const int m_sample_rate; const Speaker m_speaker; 
const T m_input_gain; - const int m_sample_rate; const size_t m_packet_size; expression_pointer<T> m_kfilter; - T m_output_energy_gain; - univector<T> m_buffer; univector<T> m_short_sum_of_squares; univector<T> m_momentary_sum_of_squares; + T m_output_energy_gain; + univector<T> m_buffer; size_t m_buffer_cursor; size_t m_short_sum_of_squares_cursor; size_t m_momentary_sum_of_squares_cursor; @@ -239,7 +266,7 @@ public: { for (Speaker sp : channels) { - m_channels.emplace_back(sample_rate, sp, packet_size_factor, 1); + m_channels.emplace_back(sample_rate, sp, packet_size_factor, T(1)); } } @@ -327,6 +354,7 @@ private: lra_vec<T> m_lra_buffer; }; +} // namespace CMT_ARCH_NAME } // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -30,10 +30,12 @@ #include "../base/memory.hpp" #include "../base/reduce.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T, size_t Size> using fir_taps = univector<T, Size>; @@ -77,7 +79,7 @@ struct state_holder state_holder() = delete; state_holder(const state_holder&) = default; state_holder(state_holder&&) = default; - constexpr state_holder(const T& state) noexcept : s(state) {} + constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {} T s; }; @@ -87,30 +89,32 @@ struct state_holder<T, true> state_holder() = delete; state_holder(const state_holder&) = default; state_holder(state_holder&&) = default; - constexpr state_holder(const T& state) noexcept : s(state) {} + constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {} const T& s; }; template <size_t tapcount, typename T, typename U, typename E1, bool stateless = false, KFR_ARCH_DEP> -struct expression_short_fir : expression_base<E1> +struct expression_short_fir : expression_with_arguments<E1> { using value_type = U; 
expression_short_fir(E1&& e1, const short_fir_state<tapcount, T, U>& state) - : expression_base<E1>(std::forward<E1>(e1)), state(state) + : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state) { } template <size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t cinput, size_t index, vec_t<U, N> x) const + KFR_INTRINSIC friend vec<U, N> get_elements(const expression_short_fir& self, cinput_t cinput, + size_t index, vec_shape<U, N> x) { - vec<U, N> in = this->argument_first(cinput, index, x); + vec<U, N> in = self.argument_first(cinput, index, x); - vec<U, N> out = in * state.s.taps[0]; - cforeach(csizeseq_t<tapcount - 1, 1>(), [&](auto I) { - out = out + concat_and_slice<tapcount - 1 - I, N>(state.s.delayline, in) * state.s.taps[I]; + vec<U, N> out = in * self.state.s.taps.front(); + cforeach(csizeseq<tapcount - 1, 1>, [&](auto I) { + out = out + + concat_and_slice<tapcount - 1 - I, N>(self.state.s.delayline, in) * self.state.s.taps[I]; }); - state.s.delayline = concat_and_slice<N, tapcount - 1>(state.s.delayline, in); + self.state.s.delayline = concat_and_slice<N, tapcount - 1>(self.state.s.delayline, in); return out; } @@ -118,31 +122,33 @@ struct expression_short_fir : expression_base<E1> }; template <typename T, typename U, typename E1, bool stateless = false, KFR_ARCH_DEP> -struct expression_fir : expression_base<E1> +struct expression_fir : expression_with_arguments<E1> { using value_type = U; expression_fir(E1&& e1, const fir_state<T, U>& state) - : expression_base<E1>(std::forward<E1>(e1)), state(state) + : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state) { } template <size_t N> - CMT_INLINE vec<U, N> operator()(cinput_t cinput, size_t index, vec_t<U, N> x) const + KFR_INTRINSIC friend vec<U, N> get_elements(const expression_fir& self, cinput_t cinput, size_t index, + vec_shape<U, N> x) { - const size_t tapcount = state.s.taps.size(); - const vec<U, N> input = this->argument_first(cinput, index, x); + const size_t tapcount = 
self.state.s.taps.size(); + const vec<U, N> input = self.argument_first(cinput, index, x); vec<U, N> output; - size_t cursor = state.s.delayline_cursor; + size_t cursor = self.state.s.delayline_cursor; CMT_LOOP_NOUNROLL for (size_t i = 0; i < N; i++) { - state.s.delayline.ringbuf_write(cursor, input[i]); - output[i] = dotproduct(state.s.taps, state.s.delayline.slice(cursor) /*, tapcount - cursor*/) + - dotproduct(state.s.taps.slice(tapcount - cursor), state.s.delayline /*, cursor*/); + self.state.s.delayline.ringbuf_write(cursor, input[i]); + output[i] = + dotproduct(self.state.s.taps, self.state.s.delayline.slice(cursor) /*, tapcount - cursor*/) + + dotproduct(self.state.s.taps.slice(tapcount - cursor), self.state.s.delayline /*, cursor*/); } - state.s.delayline_cursor = cursor; + self.state.s.delayline_cursor = cursor; return output; } state_holder<fir_state<T, U>, stateless> state; @@ -155,7 +161,7 @@ struct expression_fir : expression_base<E1> * @param taps coefficients for the FIR filter */ template <typename T, typename E1, univector_tag Tag> -CMT_INLINE internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const univector<T, Tag>& taps) +KFR_INTRINSIC internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const univector<T, Tag>& taps) { return internal::expression_fir<T, value_type_of<E1>, E1>(std::forward<E1>(e1), taps.ref()); } @@ -166,7 +172,7 @@ CMT_INLINE internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const * @param e1 an input expression */ template <typename T, typename U, typename E1> -CMT_INLINE internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, E1&& e1) +KFR_INTRINSIC internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, E1&& e1) { return internal::expression_fir<T, U, E1, true>(std::forward<E1>(e1), state); } @@ -178,7 +184,7 @@ CMT_INLINE internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, * @param taps coefficients for the FIR filter */ template <typename T, 
size_t TapCount, typename E1> -CMT_INLINE internal::expression_short_fir<next_poweroftwo(TapCount), T, value_type_of<E1>, E1> short_fir( +KFR_INTRINSIC internal::expression_short_fir<next_poweroftwo(TapCount), T, value_type_of<E1>, E1> short_fir( E1&& e1, const univector<T, TapCount>& taps) { static_assert(TapCount >= 2 && TapCount <= 32, "Use short_fir only for small FIR filters"); @@ -214,4 +220,5 @@ protected: private: fir_state<T, U> state; }; +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/fir_design.hpp b/include/kfr/dsp/fir_design.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -25,13 +25,15 @@ */ #pragma once -#include "../base/sin_cos.hpp" +#include "../math/sin_cos.hpp" #include "fir.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ -namespace intrinsics +namespace internal { template <typename T> void fir_lowpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window, bool normalize = true) @@ -115,11 +117,11 @@ void fir_bandstop(univector_ref<T> taps, T frequency1, T frequency2, const expre taps = taps * invsum; } } -} // namespace intrinsics -KFR_I_FN(fir_lowpass) -KFR_I_FN(fir_highpass) -KFR_I_FN(fir_bandpass) -KFR_I_FN(fir_bandstop) +} // namespace internal +KFR_I_FN_FULL(fir_lowpass, internal::fir_lowpass) +KFR_I_FN_FULL(fir_highpass, internal::fir_highpass) +KFR_I_FN_FULL(fir_bandpass, internal::fir_bandpass) +KFR_I_FN_FULL(fir_bandstop, internal::fir_bandstop) /** * @brief Calculates coefficients for the low-pass FIR filter @@ -129,10 +131,10 @@ KFR_I_FN(fir_bandstop) * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, - bool normalize = true) +KFR_INTRINSIC void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return 
intrinsics::fir_lowpass(taps.slice(), cutoff, window, normalize); + return internal::fir_lowpass(taps.slice(), cutoff, window, normalize); } /** @@ -143,10 +145,10 @@ CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const e * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window, - bool normalize = true) +KFR_INTRINSIC void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_highpass(taps.slice(), cutoff, window, normalize); + return internal::fir_highpass(taps.slice(), cutoff, window, normalize); } /** @@ -158,10 +160,10 @@ CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize); + return internal::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize); } /** @@ -173,49 +175,50 @@ CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, id * @param normalize true for normalized coefficients */ template <typename T, univector_tag Tag> -CMT_INLINE void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2, + const 
expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize); + return internal::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize); } /** * @copydoc kfr::fir_lowpass */ template <typename T> -CMT_INLINE void fir_lowpass(const univector_ref<T>& taps, identity<T> cutoff, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_lowpass(const univector_ref<T>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_lowpass(taps, cutoff, window, normalize); + return internal::fir_lowpass(taps, cutoff, window, normalize); } /** * @copydoc kfr::fir_highpass */ template <typename T> -CMT_INLINE void fir_highpass(const univector_ref<T>& taps, identity<T> cutoff, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_highpass(const univector_ref<T>& taps, identity<T> cutoff, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_highpass(taps, cutoff, window, normalize); + return internal::fir_highpass(taps, cutoff, window, normalize); } /** * @copydoc kfr::fir_bandpass */ template <typename T> -CMT_INLINE void fir_bandpass(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) +KFR_INTRINSIC void fir_bandpass(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandpass(taps, frequency1, frequency2, window, normalize); + return internal::fir_bandpass(taps, frequency1, frequency2, window, normalize); } /** * @copydoc kfr::fir_bandstop */ template <typename T> -CMT_INLINE void fir_bandstop(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, - const expression_pointer<T>& window, bool normalize = true) 
+KFR_INTRINSIC void fir_bandstop(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2, + const expression_pointer<T>& window, bool normalize = true) { - return intrinsics::fir_bandstop(taps, frequency1, frequency2, window, normalize); + return internal::fir_bandstop(taps, frequency1, frequency2, window, normalize); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/fracdelay.hpp b/include/kfr/dsp/fracdelay.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup fir * @{ */ /* @@ -30,12 +30,16 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ + template <typename T, typename E1> -CMT_INLINE internal::expression_short_fir<2, T, value_type_of<E1>, E1> fracdelay(E1&& e1, T delay) +KFR_INTRINSIC internal::expression_short_fir<2, T, value_type_of<E1>, E1> fracdelay(E1&& e1, T delay) { if (delay < 0) delay = 0; univector<T, 2> taps({ 1 - delay, delay }); return internal::expression_short_fir<2, T, value_type_of<E1>, E1>(std::forward<E1>(e1), taps); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -26,12 +26,15 @@ #pragma once #include "../base/basic_expressions.hpp" -#include "../base/complex.hpp" -#include "../base/sin_cos.hpp" -#include "../base/vec.hpp" +#include "../math/sin_cos.hpp" +#include "../simd/complex.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + namespace internal { @@ -48,7 +51,7 @@ struct expression_goertzel : output_expression result.imag(q2 * sin(omega)); } template <typename U, size_t N> - CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<U, N>& x) { vec<T, N> in = x; CMT_LOOP_UNROLL @@ -85,7 +88,7 @@ struct expression_parallel_goertzel : output_expression } } template <typename U, size_t N> - CMT_INLINE void 
operator()(coutput_t, size_t index, const vec<U, N>& x) + KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<U, N>& x) { const vec<T, N> in = x; CMT_LOOP_UNROLL @@ -103,18 +106,19 @@ struct expression_parallel_goertzel : output_expression vec<T, width> q1; vec<T, width> q2; }; -}; // namespace internal +} // namespace internal template <typename T> -KFR_SINTRIN internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega) +KFR_INTRINSIC internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega) { return internal::expression_goertzel<T>(result, omega); } template <typename T, size_t width> -KFR_SINTRIN internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width], - const T (&omega)[width]) +KFR_INTRINSIC internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width], + const T (&omega)[width]) { return internal::expression_parallel_goertzel<T, width>(result, read<width>(omega)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/interpolation.hpp b/include/kfr/dsp/interpolation.hpp @@ -1,72 +0,0 @@ -/** @addtogroup dsp - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
- Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../base/select.hpp" -#include "../base/sin_cos.hpp" -#include "../base/vec.hpp" - -namespace kfr -{ - -template <typename T, typename M> -KFR_FUNC T nearest(M mu, T x1, T x2) -{ - return select(mu < M(0.5), x1, x2); -} - -template <typename T, typename M> -KFR_FUNC T linear(M mu, T x1, T x2) -{ - return mix(mu, x1, x2); -} - -template <typename T, typename M> -KFR_FUNC T cosine(M mu, T x1, T x2) -{ - return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2); -} - -template <typename T, typename M> -KFR_FUNC T cubic(M mu, T x0, T x1, T x2, T x3) -{ - const T a0 = x3 - x2 - x0 + x1; - const T a1 = x0 - x1 - a0; - const T a2 = x2 - x0; - const T a3 = x1; - return horner(mu, a0, a1, a2, a3); -} - -template <typename T, typename M> -KFR_FUNC T catmullrom(M mu, T x0, T x1, T x2, T x3) -{ - const T a0 = T(0.5) * (x3 - x0) - T(1.5) * (x2 - x1); - const T a1 = x0 - T(2.5) * x1 + T(2) * x2 - T(0.5) * x3; - const T a2 = T(0.5) * (x2 - x0); - const T a3 = x1; - return horner(mu, a0, a1, a2, a3); -} -} // namespace kfr diff --git a/include/kfr/dsp/mixdown.hpp b/include/kfr/dsp/mixdown.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -29,6 +29,9 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ + /** * @brief Returns template expression that returns the sum of all the inputs */ @@ -43,12 +46,12 @@ namespace internal struct stereo_matrix { template <typename T, size_t N> - CMT_INLINE vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const + KFR_MEM_INTRINSIC vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const { - return process(x, csizeseq_t<N>()); + return process(x, csizeseq<N>); } template <typename T, size_t N, size_t... 
indices> - CMT_INLINE vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const + KFR_MEM_INTRINSIC vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const { return vec<vec<T, 2>, N>(hadd(transpose(x[indices] * matrix))...); } @@ -79,4 +82,5 @@ Result mixdown_stereo(Left&& left, Right&& right, const f64x2x2& matrix) return Result(internal::stereo_matrix{ matrix }, pack(std::forward<Left>(left), std::forward<Right>(right))); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/oscillators.hpp b/include/kfr/dsp/oscillators.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup oscillators * @{ */ /* @@ -26,19 +26,21 @@ #pragma once #include "../base/basic_expressions.hpp" -#include "../base/sin_cos.hpp" +#include "../math/sin_cos.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ template <typename T = fbase> -KFR_FUNC static auto phasor(identity<T> frequency, identity<T> sample_rate, identity<T> phase = 0) +KFR_FUNCTION static auto phasor(identity<T> frequency, identity<T> sample_rate, identity<T> phase = 0) { return fract(counter(phase, frequency / sample_rate)); } template <typename T = fbase> -KFR_FUNC static auto phasor(identity<T> frequency) +KFR_FUNCTION static auto phasor(identity<T> frequency) { return phasor(frequency, 1, 0); } @@ -46,76 +48,76 @@ KFR_FUNC static auto phasor(identity<T> frequency) namespace intrinsics { template <typename T> -KFR_FUNC T rawsine(const T& x) +KFR_INTRINSIC T rawsine(const T& x) { return intrinsics::fastsin(x * constants<T>::pi_s(2)); } template <typename T> -KFR_FUNC T sinenorm(const T& x) +KFR_INTRINSIC T sinenorm(const T& x) { return intrinsics::rawsine(fract(x)); } template <typename T> -KFR_FUNC T sine(const T& x) +KFR_INTRINSIC T sine(const T& x) { return intrinsics::sinenorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T rawsquare(const T& x) +KFR_INTRINSIC T rawsquare(const T& x) { return select(x < T(0.5), 
T(1), -T(1)); } template <typename T> -KFR_FUNC T squarenorm(const T& x) +KFR_INTRINSIC T squarenorm(const T& x) { return intrinsics::rawsquare(fract(x)); } template <typename T> -KFR_FUNC T square(const T& x) +KFR_INTRINSIC T square(const T& x) { return intrinsics::squarenorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T rawsawtooth(const T& x) +KFR_INTRINSIC T rawsawtooth(const T& x) { return T(1) - 2 * x; } template <typename T> -KFR_FUNC T sawtoothnorm(const T& x) +KFR_INTRINSIC T sawtoothnorm(const T& x) { return intrinsics::rawsawtooth(fract(x)); } template <typename T> -KFR_FUNC T sawtooth(const T& x) +KFR_INTRINSIC T sawtooth(const T& x) { return intrinsics::sawtoothnorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T isawtoothnorm(const T& x) +KFR_INTRINSIC T isawtoothnorm(const T& x) { return T(-1) + 2 * fract(x + 0.5); } template <typename T> -KFR_FUNC T isawtooth(const T& x) +KFR_INTRINSIC T isawtooth(const T& x) { return intrinsics::isawtoothnorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_FUNC T rawtriangle(const T& x) +KFR_INTRINSIC T rawtriangle(const T& x) { return 1 - abs(4 * x - 2); } template <typename T> -KFR_FUNC T trianglenorm(const T& x) +KFR_INTRINSIC T trianglenorm(const T& x) { return intrinsics::rawtriangle(fract(x + 0.25)); } template <typename T> -KFR_FUNC T triangle(const T& x) +KFR_INTRINSIC T triangle(const T& x) { return intrinsics::trianglenorm(constants<T>::recip_pi_s(1, 2) * x); } @@ -136,143 +138,145 @@ KFR_I_FN(isawtooth) KFR_I_FN(isawtoothnorm) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawsine(const T1& x) +KFR_FUNCTION T1 rawsine(const T1& x) { return intrinsics::rawsine(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawsine, E1> rawsine(E1&& x) +KFR_FUNCTION internal::expression_function<fn::rawsine, E1> rawsine(E1&& x) { return { fn::rawsine(), 
std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sine(const T1& x) +KFR_FUNCTION T1 sine(const T1& x) { return intrinsics::sine(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sine, E1> sine(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sine, E1> sine(E1&& x) { return { fn::sine(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sinenorm(const T1& x) +KFR_FUNCTION T1 sinenorm(const T1& x) { return intrinsics::sinenorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sinenorm, E1> sinenorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sinenorm, E1> sinenorm(E1&& x) { return { fn::sinenorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawsquare(const T1& x) +KFR_FUNCTION T1 rawsquare(const T1& x) { return intrinsics::rawsquare(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawsquare, E1> rawsquare(E1&& x) +KFR_FUNCTION internal::expression_function<fn::rawsquare, E1> rawsquare(E1&& x) { return { fn::rawsquare(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 square(const T1& x) +KFR_FUNCTION T1 square(const T1& x) { return intrinsics::square(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::square, E1> square(E1&& x) +KFR_FUNCTION internal::expression_function<fn::square, E1> square(E1&& x) { return { fn::square(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 squarenorm(const T1& x) +KFR_FUNCTION T1 squarenorm(const T1& x) { return intrinsics::squarenorm(x); } template <typename E1, 
KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::squarenorm, E1> squarenorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::squarenorm, E1> squarenorm(E1&& x) { return { fn::squarenorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawtriangle(const T1& x) +KFR_FUNCTION T1 rawtriangle(const T1& x) { return intrinsics::rawtriangle(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawtriangle, E1> rawtriangle(E1&& x) +KFR_FUNCTION internal::expression_function<fn::rawtriangle, E1> rawtriangle(E1&& x) { return { fn::rawtriangle(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 triangle(const T1& x) +KFR_FUNCTION T1 triangle(const T1& x) { return intrinsics::triangle(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::triangle, E1> triangle(E1&& x) +KFR_FUNCTION internal::expression_function<fn::triangle, E1> triangle(E1&& x) { return { fn::triangle(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 trianglenorm(const T1& x) +KFR_FUNCTION T1 trianglenorm(const T1& x) { return intrinsics::trianglenorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::trianglenorm, E1> trianglenorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::trianglenorm, E1> trianglenorm(E1&& x) { return { fn::trianglenorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 rawsawtooth(const T1& x) +KFR_FUNCTION T1 rawsawtooth(const T1& x) { return intrinsics::rawsawtooth(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::rawsawtooth, E1> rawsawtooth(E1&& x) 
+KFR_FUNCTION internal::expression_function<fn::rawsawtooth, E1> rawsawtooth(E1&& x) { return { fn::rawsawtooth(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sawtooth(const T1& x) +KFR_FUNCTION T1 sawtooth(const T1& x) { return intrinsics::sawtooth(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sawtooth, E1> sawtooth(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sawtooth, E1> sawtooth(E1&& x) { return { fn::sawtooth(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 sawtoothnorm(const T1& x) +KFR_FUNCTION T1 sawtoothnorm(const T1& x) { return intrinsics::sawtoothnorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::sawtoothnorm, E1> sawtoothnorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::sawtoothnorm, E1> sawtoothnorm(E1&& x) { return { fn::sawtoothnorm(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 isawtooth(const T1& x) +KFR_FUNCTION T1 isawtooth(const T1& x) { return intrinsics::isawtooth(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::isawtooth, E1> isawtooth(E1&& x) +KFR_FUNCTION internal::expression_function<fn::isawtooth, E1> isawtooth(E1&& x) { return { fn::isawtooth(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_FUNC T1 isawtoothnorm(const T1& x) +KFR_FUNCTION T1 isawtoothnorm(const T1& x) { return intrinsics::isawtoothnorm(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_FUNC internal::expression_function<fn::isawtoothnorm, E1> isawtoothnorm(E1&& x) +KFR_FUNCTION internal::expression_function<fn::isawtoothnorm, E1> isawtoothnorm(E1&& x) { return { fn::isawtoothnorm(), 
std::forward<E1>(x) }; } +} // namespace CMT_ARCH_NAME + } // namespace kfr diff --git a/include/kfr/dsp/sample_rate_conversion.hpp b/include/kfr/dsp/sample_rate_conversion.hpp @@ -25,14 +25,17 @@ */ #pragma once -#include "../base/function.hpp" #include "../base/memory.hpp" #include "../base/reduce.hpp" -#include "../base/vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/vec.hpp" #include "window.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + enum class sample_rate_conversion_quality : int { draft = 4, @@ -52,32 +55,32 @@ struct samplerate_converter using ftype = subtype<T>; private: - KFR_INTRIN ftype window(ftype n) const + KFR_MEM_INTRINSIC ftype window(ftype n) const { return modzerobessel(kaiser_beta * sqrt(1 - sqr(2 * n - 1))) * reciprocal(modzerobessel(kaiser_beta)); } - KFR_INTRIN ftype sidelobe_att() const { return kaiser_beta / 0.1102 + 8.7; } - KFR_INTRIN ftype transition_width() const { return (sidelobe_att() - 8) / (depth - 1) / 2.285; } + KFR_MEM_INTRINSIC ftype sidelobe_att() const { return kaiser_beta / 0.1102 + 8.7; } + KFR_MEM_INTRINSIC ftype transition_width() const { return (sidelobe_att() - 8) / (depth - 1) / 2.285; } public: - static KFR_INTRIN size_t filter_order(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC size_t filter_order(sample_rate_conversion_quality quality) { - return 1 << (static_cast<int>(quality) + 1); + return size_t(1) << (static_cast<int>(quality) + 1); } /// @brief Returns sidelobe attenuation for the given quality (in dB) - static KFR_INTRIN ftype sidelobe_attenuation(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC ftype sidelobe_attenuation(sample_rate_conversion_quality quality) { return (static_cast<int>(quality) - 3) * ftype(20); } /// @brief Returns transition width for the given quality (in rad) - static KFR_INTRIN ftype transition_width(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC ftype 
transition_width(sample_rate_conversion_quality quality) { return (sidelobe_attenuation(quality) - 8) / (filter_order(quality) - 1) / ftype(2.285); } - static KFR_INTRIN ftype window_param(sample_rate_conversion_quality quality) + static KFR_MEM_INTRINSIC ftype window_param(sample_rate_conversion_quality quality) { const ftype att = sidelobe_attenuation(quality); if (att > 50) @@ -112,7 +115,8 @@ public: for (itype j = 0, jj = 0; j < taps; j++) { - filter[size_t(j)] = sinc((jj - halftaps) * cutoff * c_pi<ftype, 2>) * window(ftype(jj) / ftype(taps - 1)); + filter[size_t(j)] = + sinc((jj - halftaps) * cutoff * c_pi<ftype, 2>) * window(ftype(jj) / ftype(taps - 1)); jj += size_t(interpolation_factor); if (jj >= taps) jj = jj - taps + 1; @@ -122,25 +126,31 @@ public: filter = filter * s; } - itype input_position_to_intermediate(itype in_pos) const { return in_pos * interpolation_factor; } - itype output_position_to_intermediate(itype out_pos) const { return out_pos * decimation_factor; } + KFR_MEM_INTRINSIC itype input_position_to_intermediate(itype in_pos) const + { + return in_pos * interpolation_factor; + } + KFR_MEM_INTRINSIC itype output_position_to_intermediate(itype out_pos) const + { + return out_pos * decimation_factor; + } - itype input_position_to_output(itype in_pos) const + KFR_MEM_INTRINSIC itype input_position_to_output(itype in_pos) const { return floor_div(input_position_to_intermediate(in_pos), decimation_factor).quot; } - itype output_position_to_input(itype out_pos) const + KFR_MEM_INTRINSIC itype output_position_to_input(itype out_pos) const { return floor_div(output_position_to_intermediate(out_pos), interpolation_factor).quot; } - itype output_size_for_input(itype input_size) const + KFR_MEM_INTRINSIC itype output_size_for_input(itype input_size) const { return input_position_to_output(input_position + input_size - 1) - input_position_to_output(input_position - 1); } - itype input_size_for_output(itype output_size) const + KFR_MEM_INTRINSIC itype 
input_size_for_output(itype output_size) const { return output_position_to_input(output_position + output_size - 1) - output_position_to_input(output_position - 1); @@ -183,7 +193,6 @@ public: const std::lldiv_t input_pos = floor_div(intermediate_start + interpolation_factor - 1, interpolation_factor); const itype input_start = input_pos.quot; // first input sample - const itype input_end = input_start + depth; const itype tap_start = interpolation_factor - 1 - input_pos.rem; const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(tap_start * depth)); @@ -219,8 +228,8 @@ public: return required_input_size; } - double get_fractional_delay() const { return (taps - 1) * 0.5 / decimation_factor; } - size_t get_delay() const { return static_cast<size_t>(get_fractional_delay()); } + KFR_MEM_INTRINSIC double get_fractional_delay() const { return (taps - 1) * 0.5 / decimation_factor; } + KFR_MEM_INTRINSIC size_t get_delay() const { return static_cast<size_t>(get_fractional_delay()); } ftype kaiser_beta; itype depth; @@ -244,130 +253,140 @@ template <size_t factor, size_t offset, typename E> struct expression_downsample; template <typename E> -struct expression_upsample<2, E> : expression_base<E> +struct expression_upsample<2, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() * 2; } + KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() * 2; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) { - const vec<T, N / 2> x = this->argument_first(cinput, index / 2, vec_t<T, N / 2>()); + const vec<T, N / 2> x = self.argument_first(cinput, 
index / 2, vec_shape<T, N / 2>()); return interleave(x, zerovector(x)); } - vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const + KFR_INTRINSIC friend vec<T, 1> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, 1>) { if (index & 1) return 0; else - return this->argument_first(cinput, index / 2, vec_t<T, 1>()); + return self.argument_first(cinput, index / 2, vec_shape<T, 1>()); } }; template <typename E> -struct expression_upsample<4, E> : expression_base<E> +struct expression_upsample<4, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() * 4; } + KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() * 4; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) CMT_NOEXCEPT { - const vec<T, N / 4> x = this->argument_first(cinput, index / 4, vec_t<T, N / 4>()); + const vec<T, N / 4> x = self.argument_first(cinput, index / 4, vec_shape<T, N / 4>()); const vec<T, N / 2> xx = interleave(x, zerovector(x)); return interleave(xx, zerovector(xx)); } - vec<T, 2> operator()(cinput_t cinput, size_t index, vec_t<T, 2>) const + KFR_INTRINSIC friend vec<T, 2> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, 2>) CMT_NOEXCEPT { switch (index & 3) { case 0: - return interleave(this->argument_first(cinput, index / 4, vec_t<T, 1>()), zerovector<T, 1>()); + return interleave(self.argument_first(cinput, index / 4, vec_shape<T, 1>()), zerovector<T, 1>()); case 3: - return interleave(zerovector<T, 1>(), this->argument_first(cinput, index / 4, vec_t<T, 1>())); + 
return interleave(zerovector<T, 1>(), self.argument_first(cinput, index / 4, vec_shape<T, 1>())); default: return 0; } } - vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const + KFR_INTRINSIC friend vec<T, 1> get_elements(const expression_upsample& self, cinput_t cinput, + size_t index, vec_shape<T, 1>) CMT_NOEXCEPT { if (index & 3) return 0; else - return this->argument_first(cinput, index / 4, vec_t<T, 1>()); + return self.argument_first(cinput, index / 4, vec_shape<T, 1>()); } }; template <typename E, size_t offset> -struct expression_downsample<2, offset, E> : expression_base<E> +struct expression_downsample<2, offset, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() / 2; } + KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() / 2; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_downsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) CMT_NOEXCEPT { - const vec<T, N* 2> x = this->argument_first(cinput, index * 2, vec_t<T, N * 2>()); - return x.shuffle(csizeseq_t<N, offset, 2>()); + const vec<T, N* 2> x = self.argument_first(cinput, index * 2, vec_shape<T, N * 2>()); + return x.shuffle(csizeseq<N, offset, 2>); } }; template <typename E, size_t offset> -struct expression_downsample<4, offset, E> : expression_base<E> +struct expression_downsample<4, offset, E> : expression_with_arguments<E> { - using expression_base<E>::expression_base; + using expression_with_arguments<E>::expression_with_arguments; using value_type = value_type_of<E>; using T = value_type; - size_t size() const noexcept { return expression_base<E>::size() / 4; } + KFR_MEM_INTRINSIC size_t size() 
const CMT_NOEXCEPT { return expression_with_arguments<E>::size() / 4; } template <size_t N> - vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_downsample& self, cinput_t cinput, + size_t index, vec_shape<T, N>) CMT_NOEXCEPT { - const vec<T, N* 4> x = this->argument_first(cinput, index * 4, vec_t<T, N * 4>()); - return x.shuffle(csizeseq_t<N, offset, 4>()); + const vec<T, N* 4> x = self.argument_first(cinput, index * 4, vec_shape<T, N * 4>()); + return x.shuffle(csizeseq<N, offset, 4>); } }; } // namespace internal template <typename E1, size_t offset = 0> -CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize_t<0>()) +KFR_FUNCTION internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, + csize_t<offset> = csize_t<0>()) { return internal::expression_downsample<2, offset, E1>(std::forward<E1>(e1)); } template <typename E1, size_t offset = 0> -CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize_t<0>()) +KFR_FUNCTION internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, + csize_t<offset> = csize_t<0>()) { return internal::expression_downsample<4, offset, E1>(std::forward<E1>(e1)); } template <typename E1> -CMT_INLINE internal::expression_upsample<2, E1> upsample2(E1&& e1) +KFR_FUNCTION internal::expression_upsample<2, E1> upsample2(E1&& e1) { return internal::expression_upsample<2, E1>(std::forward<E1>(e1)); } template <typename E1> -CMT_INLINE internal::expression_upsample<4, E1> upsample4(E1&& e1) +KFR_FUNCTION internal::expression_upsample<4, E1> upsample4(E1&& e1) { return internal::expression_upsample<4, E1>(std::forward<E1>(e1)); } template <typename T = fbase> -inline samplerate_converter<T> sample_rate_converter(sample_rate_conversion_quality quality, - size_t interpolation_factor, size_t decimation_factor, - subtype<T> scale = subtype<T>(1), - 
subtype<T> cutoff = 0.5f) +KFR_FUNCTION samplerate_converter<T> sample_rate_converter(sample_rate_conversion_quality quality, + size_t interpolation_factor, + size_t decimation_factor, + subtype<T> scale = subtype<T>(1), + subtype<T> cutoff = 0.5f) { using itype = typename samplerate_converter<T>::itype; return samplerate_converter<T>(quality, itype(interpolation_factor), itype(decimation_factor), scale, @@ -376,12 +395,13 @@ inline samplerate_converter<T> sample_rate_converter(sample_rate_conversion_qual // Deprecated in 0.9.2 template <typename T = fbase> -inline samplerate_converter<T> resampler(sample_rate_conversion_quality quality, size_t interpolation_factor, - size_t decimation_factor, subtype<T> scale = subtype<T>(1), - subtype<T> cutoff = 0.5f) +KFR_FUNCTION samplerate_converter<T> resampler(sample_rate_conversion_quality quality, + size_t interpolation_factor, size_t decimation_factor, + subtype<T> scale = subtype<T>(1), subtype<T> cutoff = 0.5f) { using itype = typename samplerate_converter<T>::itype; return samplerate_converter<T>(quality, itype(interpolation_factor), itype(decimation_factor), scale, cutoff); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/speaker.hpp b/include/kfr/dsp/speaker.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -27,6 +27,8 @@ namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class Speaker : int { @@ -93,4 +95,5 @@ enum class SpeakerArrangement : int Music81 = 27, Arr102 = 28 }; +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/special.hpp b/include/kfr/dsp/special.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -26,16 +26,19 @@ #pragma once #include "../base/basic_expressions.hpp" -#include "../base/operators.hpp" -#include "../base/vec.hpp" +#include "../simd/operators.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + /** * @brief Returns expression 
template that generates a unit impulse */ template <typename T = int> -static auto unitimpulse() +auto unitimpulse() { return lambda<T>([](cinput_t, size_t index, auto x) { if (index == 0) @@ -46,7 +49,7 @@ static auto unitimpulse() } template <typename T = fbase> -static auto jaehne_arg(size_t size) +auto jaehne_arg(size_t size) { return truncate(constants<T>::pi_s(1, 2) * sqr(linspace(T(0), T(size), size, false)) / size, size); } @@ -56,13 +59,13 @@ static auto jaehne_arg(size_t size) * Generates the sine with linearly increasing frequency from 0hz to nyquist frequency. */ template <typename T = fbase> -static auto jaehne(identity<T> magn, size_t size) +auto jaehne(identity<T> magn, size_t size) { return magn * sin(jaehne_arg<T>(size)); } template <typename T = fbase> -static auto swept_arg(size_t size) +auto swept_arg(size_t size) { return truncate(constants<T>::pi_s(1, 4) * sqr(sqr(linspace(T(0), T(size), size, false)) / sqr(T(size))) * T(size), @@ -74,8 +77,9 @@ static auto swept_arg(size_t size) * Generates the sine with logarithmically increasing frequency from 0hz to nyquist frequency. 
*/ template <typename T = fbase> -static auto swept(identity<T> magn, size_t size) +auto swept(identity<T> magn, size_t size) { return magn * sin(swept_arg<T>(size)); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -25,41 +25,43 @@ */ #pragma once -#include "../base/abs.hpp" #include "../base/basic_expressions.hpp" -#include "../base/log_exp.hpp" -#include "../base/vec.hpp" +#include "../math/abs.hpp" +#include "../math/log_exp.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ using sample_rate_t = double; namespace intrinsics { template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF amp_to_dB(const T& amp) +KFR_INTRINSIC TF amp_to_dB(const T& amp) { return log(static_cast<TF>(abs(amp))) * subtype<TF>(8.6858896380650365530225783783322); // return T( 20.0 ) * log10( level ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF dB_to_amp(const T& dB) +KFR_INTRINSIC TF dB_to_amp(const T& dB) { return exp(dB * subtype<TF>(0.11512925464970228420089957273422)); // return exp10( dB / 20 ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF amp_to_dB(const T& amp, const T& offset) +KFR_INTRINSIC TF amp_to_dB(const T& amp, const T& offset) { return log_fmadd(static_cast<TF>(abs(amp)), subtype<TF>(8.6858896380650365530225783783322), offset); // return T( 20.0 ) * log10( level ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset) +KFR_INTRINSIC TF dB_to_amp(const T& dB, const T& offset) { auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset; return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs); @@ -67,13 +69,13 @@ KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset) } template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout power_to_dB(const T& x) 
+KFR_INTRINSIC Tout power_to_dB(const T& x) { return log(static_cast<Tout>(abs(x))) * (10 * c_recip_log_10<Tout>); } template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout dB_to_power(const T& x) +KFR_INTRINSIC Tout dB_to_power(const T& x) { if (x == -c_infinity<Tout>) return 0.0; @@ -82,7 +84,7 @@ KFR_SINTRIN Tout dB_to_power(const T& x) } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF note_to_hertz(const T& note) +KFR_INTRINSIC TF note_to_hertz(const T& note) { const subtype<TF> offset = 2.1011784386926213177653145771814; @@ -90,7 +92,7 @@ KFR_SINTRIN TF note_to_hertz(const T& note) } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF hertz_to_note(const T& hertz) +KFR_INTRINSIC TF hertz_to_note(const T& hertz) { const subtype<TF> offset = -36.376316562295915248836189714583; @@ -98,7 +100,7 @@ KFR_SINTRIN TF hertz_to_note(const T& hertz) } template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>> -KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz) +KFR_INTRINSIC Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz) { const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818); @@ -106,7 +108,7 @@ KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tuneh } template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>> -KFR_SINTRIN Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz) +KFR_INTRINSIC Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz) { const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023); @@ -121,74 +123,75 @@ KFR_I_FN(power_to_dB) KFR_I_FN(dB_to_power) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> note_to_hertz(const T1& x) +KFR_FUNCTION flt_type<T1> note_to_hertz(const T1& x) { return 
intrinsics::note_to_hertz(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::note_to_hertz, E1> note_to_hertz(E1&& x) +KFR_FUNCTION internal::expression_function<fn::note_to_hertz, E1> note_to_hertz(E1&& x) { return { fn::note_to_hertz(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> hertz_to_note(const T1& x) +KFR_FUNCTION flt_type<T1> hertz_to_note(const T1& x) { return intrinsics::hertz_to_note(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::hertz_to_note, E1> hertz_to_note(E1&& x) +KFR_FUNCTION internal::expression_function<fn::hertz_to_note, E1> hertz_to_note(E1&& x) { return { fn::hertz_to_note(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> amp_to_dB(const T1& x) +KFR_FUNCTION flt_type<T1> amp_to_dB(const T1& x) { return intrinsics::amp_to_dB(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::amp_to_dB, E1> amp_to_dB(E1&& x) +KFR_INTRINSIC internal::expression_function<fn::amp_to_dB, E1> amp_to_dB(E1&& x) { return { fn::amp_to_dB(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> dB_to_amp(const T1& x) +KFR_FUNCTION flt_type<T1> dB_to_amp(const T1& x) { return intrinsics::dB_to_amp(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::dB_to_amp, E1> dB_to_amp(E1&& x) +KFR_FUNCTION internal::expression_function<fn::dB_to_amp, E1> dB_to_amp(E1&& x) { return { fn::dB_to_amp(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> power_to_dB(const T1& x) +KFR_FUNCTION flt_type<T1> power_to_dB(const T1& x) { return 
intrinsics::power_to_dB(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::power_to_dB, E1> power_to_dB(E1&& x) +KFR_FUNCTION internal::expression_function<fn::power_to_dB, E1> power_to_dB(E1&& x) { return { fn::power_to_dB(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN flt_type<T1> dB_to_power(const T1& x) +KFR_FUNCTION flt_type<T1> dB_to_power(const T1& x) { return intrinsics::dB_to_power(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::dB_to_power, E1> dB_to_power(E1&& x) +KFR_FUNCTION internal::expression_function<fn::dB_to_power, E1> dB_to_power(E1&& x) { return { fn::dB_to_power(), std::forward<E1>(x) }; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/waveshaper.hpp b/include/kfr/dsp/waveshaper.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -25,12 +25,15 @@ */ #pragma once -#include "../base/clamp.hpp" -#include "../base/hyperbolic.hpp" -#include "../base/operators.hpp" +#include "../math/clamp.hpp" +#include "../math/hyperbolic.hpp" +#include "../simd/operators.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + template <typename E1> inline auto waveshaper_hardclip(E1&& input, double clip_level) { @@ -44,7 +47,7 @@ inline auto waveshaper_tanh(E1&& input, double saturation) } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -CMT_FUNC flt_type<T1> saturate_I(const T1& x) +KFR_FUNCTION flt_type<T1> saturate_I(const T1& x) { const flt_type<T1> xx = -1 / (abs(static_cast<flt_type<T1>>(x)) + 1) + 1; return mulsign(xx, static_cast<flt_type<T1>>(x)); @@ -52,7 +55,7 @@ CMT_FUNC flt_type<T1> saturate_I(const T1& x) KFR_FN(saturate_I) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -CMT_FUNC flt_type<T1> saturate_II(const T1& x) +KFR_FUNCTION flt_type<T1> saturate_II(const T1& 
x) { const flt_type<T1> xx = sqr(abs(static_cast<flt_type<T1>>(x)) + 1); return mulsign((xx - 1) / (xx + 1), static_cast<flt_type<T1>>(x)); @@ -60,13 +63,13 @@ CMT_FUNC flt_type<T1> saturate_II(const T1& x) KFR_FN(saturate_II) template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_FUNC internal::expression_function<fn::saturate_II, E1> saturate_I(E1&& x) +KFR_FUNCTION internal::expression_function<fn::saturate_II, E1> saturate_I(E1&& x) { return { fn::saturate_I(), std::forward<E1>(x) }; } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -CMT_FUNC internal::expression_function<fn::saturate_II, E1> saturate_II(E1&& x) +KFR_FUNCTION internal::expression_function<fn::saturate_II, E1> saturate_II(E1&& x) { return { fn::saturate_II(), std::forward<E1>(x) }; } @@ -88,4 +91,5 @@ inline auto waveshaper_poly(E1&& input, fbase c1, fbase c3, Cs... cs) { return horner_odd(input, c1, c3, static_cast<fbase>(cs)...); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/weighting.hpp b/include/kfr/dsp/weighting.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup dsp_extra * @{ */ /* @@ -25,16 +25,19 @@ */ #pragma once -#include "../base/operators.hpp" -#include "../base/sqrt.hpp" +#include "../math/sqrt.hpp" +#include "../simd/operators.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ + namespace intrinsics { template <typename T> -KFR_SINTRIN T weight_a_unnorm(T f) +KFR_INTRINSIC T weight_a_unnorm(T f) { const T f2 = pow2(f); const T nom = pow2(12200) * pow4(f); @@ -46,13 +49,13 @@ template <typename T> const static T weight_a_gain = reciprocal(weight_a_unnorm(T(1000.0))); template <typename T> -KFR_SINTRIN T aweighting(T f) +KFR_INTRINSIC T aweighting(T f) { return weight_a_unnorm(f) * weight_a_gain<subtype<T>>; } template <typename T> -KFR_SINTRIN T weight_b_unnorm(T f) +KFR_INTRINSIC T weight_b_unnorm(T f) { const T f2 = pow2(f); const T nom = pow2(12200) * pow3(f); @@ -65,13 +68,13 @@ template 
<typename T> const static T weight_b_gain = reciprocal(weight_b_unnorm(T(1000.0))); template <typename T> -KFR_SINTRIN T bweighting(T f) +KFR_INTRINSIC T bweighting(T f) { return weight_b_unnorm(f) * weight_b_gain<subtype<T>>; } template <typename T> -KFR_SINTRIN T weight_c_unnorm(T f) +KFR_INTRINSIC T weight_c_unnorm(T f) { const T f2 = pow2(f); const T nom = pow2(12200) * f2; @@ -84,7 +87,7 @@ template <typename T> const static T weight_c_gain = reciprocal(weight_c_unnorm(T(1000.0))); template <typename T> -KFR_SINTRIN T cweighting(T f) +KFR_INTRINSIC T cweighting(T f) { return weight_c_unnorm(f) * weight_c_gain<subtype<T>>; } @@ -94,38 +97,39 @@ KFR_I_FN(bweighting) KFR_I_FN(cweighting) template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 aweighting(const T1& x) +KFR_INTRINSIC T1 aweighting(const T1& x) { return intrinsics::aweighting(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::aweighting, E1> aweighting(E1&& x) +KFR_INTRINSIC internal::expression_function<fn::aweighting, E1> aweighting(E1&& x) { return { fn::aweighting(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 bweighting(const T1& x) +KFR_INTRINSIC T1 bweighting(const T1& x) { return intrinsics::bweighting(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::bweighting, E1> bweighting(E1&& x) +KFR_INTRINSIC internal::expression_function<fn::bweighting, E1> bweighting(E1&& x) { return { fn::bweighting(), std::forward<E1>(x) }; } template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> -KFR_INTRIN T1 cweighting(const T1& x) +KFR_INTRINSIC T1 cweighting(const T1& x) { return intrinsics::cweighting(x); } template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_INTRIN internal::expression_function<fn::cweighting, E1> cweighting(E1&& x) +KFR_INTRINSIC 
internal::expression_function<fn::cweighting, E1> cweighting(E1&& x) { return { fn::cweighting(), std::forward<E1>(x) }; } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp @@ -1,4 +1,4 @@ -/** @addtogroup dsp +/** @addtogroup window * @{ */ /* @@ -25,15 +25,17 @@ */ #pragma once -#include "../base/log_exp.hpp" -#include "../base/modzerobessel.hpp" #include "../base/pointer.hpp" -#include "../base/sin_cos.hpp" -#include "../base/sqrt.hpp" -#include "../base/vec.hpp" +#include "../math/log_exp.hpp" +#include "../math/modzerobessel.hpp" +#include "../math/sin_cos.hpp" +#include "../math/sqrt.hpp" +#include "../simd/vec.hpp" namespace kfr { +inline namespace CMT_ARCH_NAME +{ enum class window_type { @@ -125,11 +127,12 @@ struct expression_rectangular : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_rectangular& self, cinput_t, + size_t index, vec_shape<T, N>) { using TI = utype<T>; - const vec<TI, N> i = enumerate(vec<TI, N>()) + cast<TI>(index); - return select(i < cast<TI>(m_size), T(1), T(0)); + const vec<TI, N> i = enumerate(vec<TI, N>()) + static_cast<TI>(index); + return select(i < static_cast<TI>(self.m_size), T(1), T(0)); } size_t size() const { return m_size; } @@ -147,9 +150,10 @@ struct expression_triangular : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_triangular& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return 1 - abs(linspace(cinput, index, y)); + return 1 - abs(get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -168,9 +172,10 @@ struct expression_bartlett : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, 
size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bartlett& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return 1 - abs(linspace(cinput, index, y)); + return 1 - abs(get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -189,9 +194,10 @@ struct expression_cosine : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_cosine& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return sin(c_pi<T> * linspace(cinput, index, y)); + return sin(c_pi<T> * get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -210,9 +216,10 @@ struct expression_hann : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_hann& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y))); + return T(0.5) * (T(1) - cos(c_pi<T, 2> * get_elements(self.linspace, cinput, index, y))); } size_t size() const { return m_size; } @@ -231,9 +238,10 @@ struct expression_bartlett_hann : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bartlett_hann& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> xx = linspace(cinput, index, y); + const vec<T, N> xx = get_elements(self.linspace, cinput, index, y); return T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5))); } size_t size() const { return m_size; } @@ -253,9 +261,11 @@ struct expression_hamming : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> 
operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_hamming& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return alpha - (T(1.0) - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y))); + return self.alpha - + (T(1.0) - self.alpha) * (cos(c_pi<T, 2> * get_elements(self.linspace, cinput, index, y))); } size_t size() const { return m_size; } @@ -275,9 +285,10 @@ struct expression_bohman : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bohman& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = abs(linspace(cinput, index, y)); + const vec<T, N> n = abs(get_elements(self.linspace, cinput, index, y)); return (T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n); } size_t size() const { return m_size; } @@ -297,10 +308,11 @@ struct expression_blackman : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_blackman& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = linspace(cinput, index, y); - return a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n); + const vec<T, N> n = get_elements(self.linspace, cinput, index, y); + return self.a0 - self.a1 * cos(c_pi<T, 2> * n) + self.a2 * cos(c_pi<T, 4> * n); } size_t size() const { return m_size; } @@ -320,9 +332,10 @@ struct expression_blackman_harris : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_blackman_harris& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; + const 
vec<T, N> n = get_elements(self.linspace, cinput, index, y) * c_pi<T, 2>; return T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n); } size_t size() const { return m_size; } @@ -343,9 +356,11 @@ struct expression_kaiser : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_kaiser& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m; + return modzerobessel(self.beta * sqrt(1 - sqr(get_elements(self.linspace, cinput, index, y)))) * + self.m; } size_t size() const { return m_size; } @@ -366,9 +381,10 @@ struct expression_flattop : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_flattop& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>; + const vec<T, N> n = get_elements(self.linspace, cinput, index, y) * c_pi<T, 2>; constexpr T a0 = 1; constexpr T a1 = 1.93; constexpr T a2 = 1.29; @@ -393,9 +409,10 @@ struct expression_gaussian : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const expression_gaussian& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return exp(T(-0.5) * sqr(alpha * linspace(cinput, index, y))); + return exp(T(-0.5) * sqr(self.alpha * get_elements(self.linspace, cinput, index, y))); } size_t size() const { return m_size; } @@ -416,9 +433,10 @@ struct expression_lanczos : input_expression { } template <size_t N> - CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const + KFR_INTRINSIC friend vec<T, N> get_elements(const 
expression_lanczos& self, cinput_t cinput, + size_t index, vec_shape<T, N> y) { - return sinc(linspace(cinput, index, y)); + return sinc(get_elements(self.linspace, cinput, index, y)); } size_t size() const { return m_size; } @@ -458,7 +476,7 @@ KFR_WINDOW_BY_TYPE(lanczos) /** * @brief Returns template expression that generates Rrectangular window of length @c size */ -CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t size) +KFR_FUNCTION internal::expression_rectangular<fbase> window_rectangular(size_t size) { return internal::expression_rectangular<fbase>(size, fbase()); } @@ -467,7 +485,7 @@ CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t siz * @brief Returns template expression that generates Triangular window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_triangular<T>(size); } @@ -476,7 +494,7 @@ CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, cty * @brief Returns template expression that generates Bartlett window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bartlett<T>(size); } @@ -485,7 +503,7 @@ CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t * @brief Returns template expression that generates Cosine window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>()) { return 
internal::expression_cosine<T>(size); } @@ -494,7 +512,7 @@ CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> * @brief Returns template expression that generates Hann window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_hann<T>(size); } @@ -503,7 +521,8 @@ CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ct * @brief Returns template expression that generates Bartlett-Hann window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, + ctype_t<T> = ctype_t<T>()) { return internal::expression_bartlett_hann<T>(size); } @@ -513,8 +532,8 @@ CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t siz * alpha */ template <typename T = fbase> -CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54, - ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54, + ctype_t<T> = ctype_t<T>()) { return internal::expression_hamming<T>(size, alpha); } @@ -523,7 +542,7 @@ CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity< * @brief Returns template expression that generates Bohman window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_bohman<T>(size); } @@ -533,7 +552,7 @@ CMT_INLINE internal::expression_bohman<T> window_bohman(size_t 
size, ctype_t<T> * alpha */ template <typename T = fbase> -CMT_INLINE internal::expression_blackman<T> window_blackman( +KFR_FUNCTION internal::expression_blackman<T> window_blackman( size_t size, identity<T> alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { @@ -544,7 +563,7 @@ CMT_INLINE internal::expression_blackman<T> window_blackman( * @brief Returns template expression that generates Blackman-Harris window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris( +KFR_FUNCTION internal::expression_blackman_harris<T> window_blackman_harris( size_t size, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>()) { return internal::expression_blackman_harris<T>(size, T(), symmetry); @@ -555,8 +574,8 @@ CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris( * beta */ template <typename T = fbase> -CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5), - ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5), + ctype_t<T> = ctype_t<T>()) { return internal::expression_kaiser<T>(size, beta); } @@ -565,7 +584,7 @@ CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> * @brief Returns template expression that generates Flat top window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_flattop<T>(size); } @@ -575,8 +594,8 @@ CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T * alpha */ template <typename T = fbase> -CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> alpha = 2.5, - 
ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> alpha = 2.5, + ctype_t<T> = ctype_t<T>()) { return internal::expression_gaussian<T>(size, alpha); } @@ -585,7 +604,7 @@ CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identit * @brief Returns template expression that generates Lanczos window of length @c size */ template <typename T = fbase> -CMT_INLINE internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) +KFR_FUNCTION internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>()) { return internal::expression_lanczos<T>(size); } @@ -615,6 +634,7 @@ CMT_NOINLINE expression_pointer<T> window(size_t size, window_type type, identit return to_pointer( typename internal::window_by_type<window>::template type<T>(size, win_param, symmetry)); }, - fn::returns<expression_pointer<T>>()); + fn_generic::returns<expression_pointer<T>>()); } +} // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/ext/console_colors.hpp b/include/kfr/ext/console_colors.hpp @@ -1,162 +0,0 @@ -#pragma once -#include <cstdint> -#include <cstdio> - -//#define CONSOLE_COLORS_FORCE_ASCII - -#if defined _WIN32 && !defined PRINT_COLORED_FORCE_ASCII -#define USE_WIN32_API -#endif - -#if defined(USE_WIN32_API) - -namespace win32_lite -{ -typedef void* HANDLE; -typedef uint32_t DWORD; - -#define WIN32_LITE_STD_INPUT_HANDLE ((win32_lite::DWORD)-10) -#define WIN32_LITE_STD_OUTPUT_HANDLE ((win32_lite::DWORD)-11) -#define WIN32_LITE_STD_ERROR_HANDLE ((win32_lite::DWORD)-12) - -#define WIN32_LITE_DECLSPEC_IMPORT __declspec(dllimport) - -#define WIN32_LITE_WINAPI __stdcall - -typedef short SHORT; -typedef unsigned short WORD; -typedef int WINBOOL; - -extern "C" -{ - WIN32_LITE_DECLSPEC_IMPORT HANDLE WIN32_LITE_WINAPI GetStdHandle(DWORD nStdHandle); - WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleTextAttribute(HANDLE hConsoleOutput, - 
WORD wAttributes); -} -} // namespace win32_lite - -#endif - -namespace console_colors -{ - -enum text_color : uint32_t -{ - Black = 0x00, - DarkBlue = 0x01, - DarkGreen = 0x02, - DarkCyan = 0x03, - DarkRed = 0x04, - DarkMagenta = 0x05, - DarkYellow = 0x06, - LightGrey = 0x07, - Gray = 0x08, - Blue = 0x09, - Green = 0x0A, - Cyan = 0x0B, - Red = 0x0C, - Magenta = 0x0D, - Yellow = 0x0E, - White = 0x0F, - BgBlack = 0x00, - BgDarkBlue = 0x10, - BgDarkGreen = 0x20, - BgDarkCyan = 0x30, - BgDarkRed = 0x40, - BgDarkMagenta = 0x50, - BgDarkYellow = 0x60, - BgLightGrey = 0x70, - BgGray = 0x80, - BgBlue = 0x90, - BgGreen = 0xA0, - BgCyan = 0xB0, - BgRed = 0xC0, - BgMagenta = 0xD0, - BgYellow = 0xE0, - BgWhite = 0xF0, - - Normal = BgBlack | LightGrey -}; - -enum console_buffer -{ - ConsoleStdOutput, - ConsoleStdError -}; - -struct console_color -{ -public: - console_color(text_color c, console_buffer console = ConsoleStdOutput) - : m_old(get(console)), m_console(console) - { - set(c, m_console); - } - - ~console_color() { set(m_old, m_console); } - -private: - text_color get(console_buffer console = ConsoleStdOutput) { return saved_color(); } - - void set(text_color new_color, console_buffer console = ConsoleStdOutput) - { -#ifdef USE_WIN32_API - win32_lite::SetConsoleTextAttribute(win32_lite::GetStdHandle(console == ConsoleStdOutput - ? WIN32_LITE_STD_OUTPUT_HANDLE - : WIN32_LITE_STD_ERROR_HANDLE), - static_cast<win32_lite::WORD>(new_color)); -#else - if (new_color != Normal) - { - uint8_t t = new_color & 0xF; - uint8_t b = (new_color & 0xF0) >> 4; - uint8_t tnum = 30 + ((t & 1) << 2 | (t & 2) | (t & 4) >> 2); - uint8_t bnum = 40 + ((b & 1) << 2 | (b & 2) | (b & 4) >> 2); - if (t & 8) - tnum += 60; - if (b & 8) - bnum += 60; - std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[%d;%dm", tnum, bnum); - } - else - { - std::fprintf(console == ConsoleStdOutput ? 
stdout : stderr, "\x1B[0m"); - } -#endif - saved_color() = new_color; - } - - text_color m_old; - console_buffer m_console; - static text_color& saved_color() - { - static text_color color = Normal; - return color; - } -}; - -template <text_color color, console_buffer console = ConsoleStdOutput> -struct console_color_tpl : public console_color -{ -public: - console_color_tpl() : console_color(color, console) {} - -private: -}; - -typedef console_color_tpl<DarkBlue> darkblue_text; -typedef console_color_tpl<DarkGreen> darkgreen_text; -typedef console_color_tpl<DarkCyan> darkcyan_text; -typedef console_color_tpl<DarkRed> darkred_text; -typedef console_color_tpl<DarkMagenta> darkmagenta_text; -typedef console_color_tpl<DarkYellow> darkyellow_text; -typedef console_color_tpl<LightGrey> lightgrey_text; -typedef console_color_tpl<Gray> gray_text; -typedef console_color_tpl<Blue> blue_text; -typedef console_color_tpl<Green> green_text; -typedef console_color_tpl<Cyan> cyan_text; -typedef console_color_tpl<Red> red_text; -typedef console_color_tpl<Magenta> magenta_text; -typedef console_color_tpl<Yellow> yellow_text; -typedef console_color_tpl<White> white_text; -} // namespace console_colors diff --git a/include/kfr/ext/double_double.hpp b/include/kfr/ext/double_double.hpp @@ -1,86 +0,0 @@ -#pragma once - -#include <cmath> - -struct double_double -{ - double hi, lo; - - constexpr double_double(double x) noexcept : hi(x), lo(0.0) {} - constexpr double_double(float x) noexcept : hi(x), lo(0.0) {} - constexpr double_double(double hi, double lo) noexcept : hi(hi + lo), lo((hi - (hi + lo)) + lo) {} - constexpr operator double() const noexcept { return hi + lo; } - constexpr operator float() const noexcept { return hi + lo; } - - constexpr friend double_double operator-(const double_double& x) noexcept { return { -x.hi, -x.lo }; } - constexpr friend double_double operator+(const double_double& x, const double_double& y) noexcept - { - const double sum = x.hi + y.hi; - return { 
sum, std::abs(x.hi) > std::abs(y.hi) ? (((x.hi - sum) + y.hi) + y.lo) + x.lo - : (((y.hi - sum) + x.hi) + x.lo) + y.lo }; - } - constexpr friend double_double operator-(const double_double& x, const double_double& y) noexcept - { - const double diff = x.hi - y.hi; - return { diff, std::abs(x.hi) > std::abs(y.hi) ? (((x.hi - diff) - y.hi) - y.lo) + x.lo - : (((-y.hi - diff) + x.hi) + x.lo) - y.lo }; - } - constexpr friend double_double operator*(const double_double& x, const double_double& y) noexcept - { - const double_double c = mul(x.hi, y.hi); - const double cc = (x.hi * y.lo + x.lo * y.hi) + c.lo; - return { c.hi, cc }; - } - constexpr friend double_double operator/(const double_double& x, const double_double& y) noexcept - { - const double c = x.hi / y.hi; - const double_double u = mul(c, y.hi); - const double cc = ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) / y.hi; - return { c, cc }; - } - -#if defined _MSC_VER && !defined __clang__ -#define DOUBLEDOUBLE_CONSTEXPR -#else -#define DOUBLEDOUBLE_CONSTEXPR constexpr -#endif - - DOUBLEDOUBLE_CONSTEXPR bool isinf() const noexcept { return std::isinf(hi); } - DOUBLEDOUBLE_CONSTEXPR bool isnan() const noexcept { return std::isnan(hi) || std::isnan(lo); } - - DOUBLEDOUBLE_CONSTEXPR double ulp(float value) const noexcept - { - if (std::isnan(value) && isnan()) - return 0.0; - if (std::isinf(value) && isinf() && (std::copysign(1.0f, value) == std::copysign(1.0, hi))) - return 0.0; - if (std::nexttoward(value, 0.0) == 0.0) - return 1.0; - return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); - } - DOUBLEDOUBLE_CONSTEXPR double ulp(double value) const noexcept - { - if (std::isnan(value) && isnan()) - return 0.0; - if (std::isinf(value) && isinf() && (std::copysign(1.0, value) == std::copysign(1.0, hi))) - return 0.0; - if (std::nexttoward(value, 0.0) == 0.0) - return 1.0; - return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); - } - -private: - constexpr 
static double_double splitprec(double x) noexcept - { - const double p = x * 1.34217729e8; - const double h = (x - p) + p; - return { h, x - h }; - } - constexpr static double_double mul(double x, double y) noexcept - { - const double_double xx = splitprec(x); - const double_double yy = splitprec(y); - const double z = x * y; - return { z, ((xx.hi * yy.hi - z) + xx.hi * yy.lo + xx.lo * yy.hi) + xx.lo * yy.lo }; - } -}; diff --git a/include/kfr/io/audiofile.hpp b/include/kfr/io/audiofile.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup audio_io * @{ */ /* @@ -28,8 +28,8 @@ #include "../base/basic_expressions.hpp" #include "../base/conversion.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" #include "../cometa/ctti.hpp" +#include "../simd/vec.hpp" #include "file.hpp" #ifndef KFR_ENABLE_WAV @@ -64,10 +64,8 @@ struct audio_format struct audio_format_and_length : audio_format { using audio_format::audio_format; -#ifdef CMT_COMPILER_MSVC - audio_format_and_length() noexcept {} -#endif - audio_format_and_length(const audio_format& fmt) : audio_format(fmt) {} + constexpr audio_format_and_length() CMT_NOEXCEPT {} + constexpr audio_format_and_length(const audio_format& fmt) : audio_format(fmt) {} imax length = 0; // in samples }; @@ -95,39 +93,43 @@ struct audio_writer : public abstract_writer<T> virtual void close() = 0; }; -namespace internal +namespace internal_generic { #if KFR_ENABLE_WAV -static size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, size_t bytesToWrite) +static inline size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, + size_t bytesToWrite) { return file->write(pData, bytesToWrite); } -static drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, drwav_seek_origin origin) +static inline drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, + drwav_seek_origin origin) { return file->seek(offset, origin == drwav_seek_origin_start ? 
seek_origin::begin : seek_origin::current); } -static size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +static inline size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) { return file->read(pBufferOut, bytesToRead); } -static drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, drwav_seek_origin origin) +static inline drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, + drwav_seek_origin origin) { return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current); } #endif #if KFR_ENABLE_FLAC -static size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +static inline size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, + size_t bytesToRead) { return file->read(pBufferOut, bytesToRead); } -static drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset, - drflac_seek_origin origin) +static inline drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset, + drflac_seek_origin origin) { return file->seek(offset, origin == drflac_seek_origin_start ? seek_origin::begin : seek_origin::current); } #endif -} // namespace internal +} // namespace internal_generic #if KFR_ENABLE_WAV /// @brief WAV format writer @@ -139,17 +141,19 @@ struct audio_writer_wav : audio_writer<T> : writer(std::move(writer)), f(nullptr), fmt(fmt) { drwav_data_format wav_fmt; - wav_fmt.channels = fmt.channels; - wav_fmt.sampleRate = fmt.samplerate; + wav_fmt.channels = static_cast<drwav_uint32>(fmt.channels); + wav_fmt.sampleRate = static_cast<drwav_uint32>(fmt.samplerate); wav_fmt.format = fmt.type >= audio_sample_type::first_float ? 
DR_WAVE_FORMAT_IEEE_FLOAT : DR_WAVE_FORMAT_PCM; - wav_fmt.bitsPerSample = audio_sample_bit_depth(fmt.type); + wav_fmt.bitsPerSample = static_cast<drwav_uint32>(audio_sample_bit_depth(fmt.type)); wav_fmt.container = fmt.use_w64 ? drwav_container_w64 : drwav_container_riff; - f = drwav_open_write(&wav_fmt, (drwav_write_proc)&internal::drwav_writer_write_proc, - (drwav_seek_proc)&internal::drwav_writer_seek_proc, this->writer.get()); + f = drwav_open_write(&wav_fmt, (drwav_write_proc)&internal_generic::drwav_writer_write_proc, + (drwav_seek_proc)&internal_generic::drwav_writer_seek_proc, this->writer.get()); } ~audio_writer_wav() { close(); } + using audio_writer<T>::write; + /// @brief Write data to underlying binary writer size_t write(const T* data, size_t size) override { @@ -184,7 +188,7 @@ struct audio_writer_wav : audio_writer<T> imax tell() const override { return fmt.length; } - bool seek(imax position, seek_origin origin) override { return false; } + bool seek(imax, seek_origin) override { return false; } private: std::shared_ptr<abstract_writer<>> writer; @@ -199,8 +203,8 @@ struct audio_reader_wav : audio_reader<T> /// @brief Constructs WAV reader audio_reader_wav(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) { - f = drwav_open((drwav_read_proc)&internal::drwav_reader_read_proc, - (drwav_seek_proc)&internal::drwav_reader_seek_proc, this->reader.get()); + f = drwav_open((drwav_read_proc)&internal_generic::drwav_reader_read_proc, + (drwav_seek_proc)&internal_generic::drwav_reader_seek_proc, this->reader.get()); fmt.channels = f->channels; fmt.samplerate = f->sampleRate; fmt.length = f->totalSampleCount / fmt.channels; @@ -307,8 +311,8 @@ struct audio_reader_flac : audio_reader<T> /// @brief Constructs FLAC reader audio_reader_flac(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) { - f = drflac_open((drflac_read_proc)&internal::drflac_reader_read_proc, - (drflac_seek_proc)&internal::drflac_reader_seek_proc, 
this->reader.get()); + f = drflac_open((drflac_read_proc)&internal_generic::drflac_reader_read_proc, + (drflac_seek_proc)&internal_generic::drflac_reader_seek_proc, this->reader.get()); fmt.channels = f->channels; fmt.samplerate = f->sampleRate; fmt.length = f->totalSampleCount / fmt.channels; diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup binary_io * @{ */ /* @@ -25,9 +25,9 @@ */ #pragma once -#include "../base/function.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" +#include "../simd/impl/function.hpp" +#include "../simd/vec.hpp" #include <cstdio> #include <string> #include <vector> @@ -63,6 +63,7 @@ inline FILE* fopen_portable(const filepath_char* path, const filepath_char* mode #ifdef CMT_OS_WIN FILE* f = nullptr; errno_t e = _wfopen_s(&f, path, mode); + (void)e; return f; #else return fopen(path, mode); @@ -98,6 +99,14 @@ struct abstract_stream bool seek(imax offset, int origin) { return seek(offset, static_cast<seek_origin>(origin)); } }; +namespace internal_generic +{ +struct empty +{ +}; + +} // namespace internal_generic + /// @brief Base class for all typed readers template <typename T = void> struct abstract_reader : abstract_stream<T> @@ -117,6 +126,10 @@ struct abstract_reader : abstract_stream<T> this->read(result); return result; } + bool read(conditional<is_void<T>::value, internal_generic::empty, T>& data) + { + return read(&data, 1) == 1; + } }; /// @brief Base class for all typed writers @@ -131,6 +144,10 @@ struct abstract_writer : abstract_stream<T> return write(data.data(), data.size()); } size_t write(univector_ref<const T>&& data) { return write(data.data(), data.size()); } + bool write(const conditional<is_void<T>::value, internal_generic::empty, T>& data) + { + return write(&data, 1) == 1; + } }; template <typename From, typename To = void> @@ -207,6 +224,8 @@ struct file_reader : abstract_reader<T> ~file_reader() override {} size_t read(T* data, size_t 
size) final { return fread(data, element_size<T>(), size, handle.file); } + using abstract_reader<T>::read; + imax tell() const final { return IO_TELL_64(handle.file); } bool seek(imax offset, seek_origin origin) final { @@ -221,6 +240,8 @@ struct file_writer : abstract_writer<T> { file_writer(file_handle&& handle) : handle(std::move(handle)) {} ~file_writer() override {} + + using abstract_writer<T>::write; size_t write(const T* data, size_t size) final { return fwrite(data, element_size<T>(), size, handle.file); diff --git a/include/kfr/io/impl/audiofile-impl.cpp b/include/kfr/io/impl/audiofile-impl.cpp @@ -25,6 +25,8 @@ */ #include "../audiofile.hpp" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wimplicit-fallthrough") #if defined(KFR_ENABLE_WAV) && KFR_ENABLE_WAV #define DR_WAV_NO_STDIO @@ -37,3 +39,5 @@ #define DR_FLAC_NO_STDIO #include "../dr/dr_flac.h" #endif + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/io/python_plot.hpp b/include/kfr/io/python_plot.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup plotting * @{ */ /* @@ -24,8 +24,8 @@ See https://www.kfrlib.com for details. 
*/ #pragma once -#include "../base/vec.hpp" #include "../cometa/string.hpp" +#include "../simd/vec.hpp" #include <cstdlib> #ifdef CMT_OS_WIN @@ -38,7 +38,7 @@ namespace kfr { -namespace internal +namespace internal_generic { CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wdeprecated-declarations") @@ -51,7 +51,7 @@ void python(const std::string& name, const std::string& code) std::string filename; { char curdir[1024]; - cross_getcwd(curdir, arraysize(curdir)); + (void)cross_getcwd(curdir, arraysize(curdir)); filename = curdir; } #ifdef CMT_OS_WIN @@ -64,7 +64,7 @@ void python(const std::string& name, const std::string& code) FILE* f = fopen(filename.c_str(), "w"); fwrite(code.c_str(), 1, code.size(), f); fclose(f); - std::system(("python \"" + filename + "\"").c_str()); + (void)std::system(("python \"" + filename + "\"").c_str()); } CMT_PRAGMA_GNU(GCC diagnostic pop) @@ -78,7 +78,7 @@ inline T flush_to_zero(T value) { return static_cast<double>(value); } -} // namespace internal +} // namespace internal_generic inline std::string concat_args() { return {}; } @@ -106,7 +106,7 @@ void plot_show(const std::string& name, const std::string& wavfile, const std::s std::string ss; ss += python_prologue() + "dspplot.plot(" + concat_args("r'" + wavfile + "'", options) + ")\n"; - internal::python(name, ss); + internal_generic::python(name, ss); print("done\n"); } @@ -125,12 +125,12 @@ void plot_show(const std::string& name, const T& x, const std::string& options = std::string ss; ss += python_prologue() + "data = [\n"; for (size_t i = 0; i < array.size(); i++) - ss += as_string(fmt<'g', 20, 17>(internal::flush_to_zero(array[i])), ",\n"); + ss += as_string(fmt<'g', 20, 17>(internal_generic::flush_to_zero(array[i])), ",\n"); ss += "]\n"; ss += "dspplot.plot(" + concat_args("data", options) + ")\n"; - internal::python(name, ss); + internal_generic::python(name, ss); print("done\n"); } @@ -170,7 +170,7 @@ void perfplot_show(const std::string& name, T1&& data, T2&& 
labels, const std::s ss += "dspplot.perfplot(" + concat_args("data, labels", options) + ")\n"; - internal::python(name, ss); + internal_generic::python(name, ss); print("done\n"); } diff --git a/include/kfr/io/tostring.hpp b/include/kfr/io/tostring.hpp @@ -1,4 +1,4 @@ -/** @addtogroup io +/** @addtogroup string_io * @{ */ /* @@ -25,15 +25,50 @@ */ #pragma once -#include "../base/complex.hpp" #include "../base/univector.hpp" -#include "../base/vec.hpp" #include "../cometa/string.hpp" +#include "../simd/complex.hpp" +#include "../simd/vec.hpp" #include <cmath> namespace cometa { +template <> +struct representation<cometa::special_value> +{ + using type = std::string; + static std::string get(const cometa::special_value& value) + { + using cometa::special_constant; + switch (value.c) + { + case special_constant::undefined: + return "undefined"; + case special_constant::default_constructed: + return "default_constructed"; + case special_constant::infinity: + return "infinity"; + case special_constant::neg_infinity: + return "neg_infinity"; + case special_constant::min: + return "min"; + case special_constant::max: + return "max"; + case special_constant::neg_max: + return "neg_max"; + case special_constant::lowest: + return "lowest"; + case special_constant::integer: + return as_string(value.ll); + case special_constant::floating_point: + return as_string(value.d); + default: + return "unknown"; + } + } +}; + namespace details { @@ -157,10 +192,21 @@ struct representation<kfr::univector<T, Tag>> return details::array_to_string(value.data(), value.size()); } }; +template <typename T, size_t Size> +struct representation<std::array<T, Size>> +{ + using type = std::string; + static std::string get(const std::array<T, Size>& value) + { + return details::array_to_string(value.data(), value.size()); + } +}; } // namespace cometa namespace kfr { +inline namespace CMT_ARCH_NAME +{ namespace internal { @@ -205,6 +251,7 @@ inline internal::expression_printer printer() { return 
internal::expression_prin /// @brief Returns an output expression that prints the values with their types (used for debug) inline internal::expression_debug_printer debug_printer() { return internal::expression_debug_printer(); } +} // namespace CMT_ARCH_NAME /// @brief Converts dB value to string (uses oo for infinity symbol) template <typename T> diff --git a/include/kfr/kfr.h b/include/kfr/kfr.h @@ -0,0 +1,70 @@ +/** @addtogroup utility + * @{ + */ +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include "cident.h" + +#define KFR_VERSION_MAJOR 3 +#define KFR_VERSION_MINOR 0 +#define KFR_VERSION_PATCH 5 +#define KFR_VERSION_LABEL "rc" + +#define KFR_VERSION_STRING \ + CMT_STRINGIFY(KFR_VERSION_MAJOR) \ + "." CMT_STRINGIFY(KFR_VERSION_MINOR) "." CMT_STRINGIFY(KFR_VERSION_PATCH) "-" KFR_VERSION_LABEL +#define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_PATCH) + +#if defined DEBUG || defined KFR_DEBUG +#define KFR_DEBUG_STR " debug" +#elif defined NDEBUG || defined KFR_NDEBUG +#define KFR_DEBUG_STR " optimized" +#else +#define KFR_DEBUG_STR "" +#endif + +#define KFR_NATIVE_INTRINSICS 1 + +#if defined CMT_COMPILER_CLANG && !defined CMT_DISABLE_CLANG_EXT +#define CMT_CLANG_EXT +#endif + +#ifdef KFR_NATIVE_INTRINSICS +#define KFR_BUILD_DETAILS_1 " +in" +#else +#define KFR_BUILD_DETAILS_1 "" +#endif + +#ifdef CMT_CLANG_EXT +#define KFR_BUILD_DETAILS_2 " +ve" +#else +#define KFR_BUILD_DETAILS_2 "" +#endif + +#define KFR_VERSION_FULL \ + "KFR " KFR_VERSION_STRING KFR_DEBUG_STR \ + " " CMT_STRINGIFY(CMT_ARCH_NAME) " " CMT_ARCH_BITNESS_NAME " (" CMT_COMPILER_FULL_NAME "/" CMT_OS_NAME \ + ")" KFR_BUILD_DETAILS_1 KFR_BUILD_DETAILS_2 + +#ifdef __cplusplus +namespace kfr +{ +/// @brief KFR version string +constexpr const char version_string[] = KFR_VERSION_STRING; + +constexpr int version_major = KFR_VERSION_MAJOR; +constexpr int version_minor = KFR_VERSION_MINOR; +constexpr int version_patch = KFR_VERSION_PATCH; +constexpr int 
version = KFR_VERSION; + +/// @brief KFR version string including architecture and compiler name +constexpr const char version_full[] = KFR_VERSION_FULL; +} // namespace kfr +#endif + +#define KFR_INTRINSIC CMT_INTRINSIC +#define KFR_MEM_INTRINSIC CMT_MEM_INTRINSIC +#define KFR_FUNCTION CMT_FUNCTION diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp @@ -22,4 +22,24 @@ */ #pragma once -#include "base.hpp" +#include "simd.hpp" + +#include "math/abs.hpp" +#include "math/asin_acos.hpp" +#include "math/atan.hpp" +#include "math/clamp.hpp" +#include "math/compiletime.hpp" +#include "math/complex_math.hpp" +#include "math/gamma.hpp" +#include "math/hyperbolic.hpp" +#include "math/interpolation.hpp" +#include "math/log_exp.hpp" +#include "math/logical.hpp" +#include "math/min_max.hpp" +#include "math/modzerobessel.hpp" +#include "math/round.hpp" +#include "math/saturation.hpp" +#include "math/select.hpp" +#include "math/sin_cos.hpp" +#include "math/sqrt.hpp" +#include "math/tan.hpp" diff --git a/include/kfr/math/abs.hpp b/include/kfr/math/abs.hpp @@ -0,0 +1,54 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/abs.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the absolute value of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 abs(const T1& x) +{ + return intrinsics::abs(x); +} + +/** + * @brief Returns template expression that returns the absolute value of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::abs, E1> abs(E1&& x) +{ + return { fn::abs(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/math/asin_acos.hpp b/include/kfr/math/asin_acos.hpp @@ -0,0 +1,71 @@ +/** @addtogroup trigonometric + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "impl/asin_acos.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the arc sine of x. The returned angle is in the range \f$-\pi/2\f$ through \f$\pi/2\f$. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC flt_type<T1> asin(const T1& x) +{ + return intrinsics::asin(x); +} + +/** + * @brief Returns template expression that returns the arc sine of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::asin, E1> asin(E1&& x) +{ + return { fn::asin(), std::forward<E1>(x) }; +} +/** + * @brief Returns the arc cosine of x. The returned angle is in the range 0 through \f$\pi\f$. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC flt_type<T1> acos(const T1& x) +{ + return intrinsics::acos(x); +} + +/** + * @brief Returns template expression that returns the arc cosine of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::acos, E1> acos(E1&& x) +{ + return { fn::acos(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/math/atan.hpp b/include/kfr/math/atan.hpp @@ -0,0 +1,110 @@ +/** @addtogroup trigonometric + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/atan.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the arc tangent of x. The returned angle is in the range \f$-\pi/2\f$ through + * \f$\pi/2\f$. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> atan(const T1& x) +{ + return intrinsics::atan(x); +} + +/** + * @brief Returns template expression that returns the arc tangent of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::atan, E1> atan(E1&& x) +{ + return { fn::atan(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the arc tangent of the x, expressed in degrees. The returned angle is in the range -90 + * through 90. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> atandeg(const T1& x) +{ + return intrinsics::atandeg(x); +} + +/** + * @brief Returns template expression that returns the arc tangent of the x, expressed in degrees. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::atandeg, E1> atandeg(E1&& x) +{ + return { fn::atandeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the arc tangent of y/x using the signs of arguments to determine the correct quadrant. 
+ */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION common_type<T1, T2> atan2(const T1& x, const T2& y) +{ + return intrinsics::atan2(x, y); +} + +/** + * @brief Returns template expression that returns the arc tangent of y/x. + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::atan2, E1, E2> atan2(E1&& x, E2&& y) +{ + return { fn::atan2(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the arc tangent of y/x (expressed in degrees) using the signs of arguments to determine the + * correct quadrant. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION common_type<T1, T2> atan2deg(const T1& x, const T2& y) +{ + return intrinsics::atan2deg(x, y); +} + +/** + * @brief Returns template expression that returns the arc tangent of y/x (expressed in degrees). + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::atan2deg, E1, E2> atan2deg(E1&& x, E2&& y) +{ + return { fn::atan2deg(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/clamp.hpp b/include/kfr/math/clamp.hpp @@ -0,0 +1,65 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/clamp.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the first argument clamped to a range [lo, hi] +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), + typename Tout = common_type<T1, T2, T3>> +KFR_INTRINSIC Tout clamp(const T1& x, const T2& lo, const T3& hi) +{ + return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi)); +} + +/// @brief Creates an expression that returns the first argument clamped to a range [lo, hi] +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi) +{ + return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) }; +} + +/// @brief Returns the first argument clamped to a range [0, hi] +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout clamp(const T1& x, const T2& hi) +{ + return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(hi)); +} + +/// @brief Creates an expression that returns the first argument clamped to a range [0, hi] +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::clamp, E1, E2> clamp(E1&& x, E2&& hi) +{ + return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(hi) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git 
a/include/kfr/math/compiletime.hpp b/include/kfr/math/compiletime.hpp @@ -0,0 +1,84 @@ +/** @addtogroup math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once +#include "../simd/constants.hpp" +#include "../simd/operators.hpp" +#include "../simd/types.hpp" + +namespace kfr +{ + +namespace compiletime +{ + +template <typename T> +constexpr inline T select(bool c, T x, T y) +{ + return c ? x : y; +} +template <typename T> +constexpr inline T trunc(T x) +{ + return static_cast<T>(static_cast<long long>(x)); +} +template <typename T> +constexpr inline T abs(T x) +{ + return x < T() ? -x : x; +} +template <typename T> +constexpr inline T mulsign(T x, T y) +{ + return y < T() ? 
-x : x; +} +template <typename T> +constexpr inline T sin(T x) +{ + x = x - trunc(x / c_pi<T, 2>) * c_pi<T, 2>; + constexpr T c2 = -0.16665853559970855712890625; + constexpr T c4 = +8.31427983939647674560546875e-3; + constexpr T c6 = -1.85423981747590005397796630859375e-4; + + x -= c_pi<T>; + T y = abs(x); + y = select(y > c_pi<T, 1, 2>, c_pi<T> - y, y); + y = mulsign(y, -x); + + const T y2 = y * y; + T formula = c6; + const T y3 = y2 * y; + formula = fmadd(formula, y2, c4); + formula = fmadd(formula, y2, c2); + formula = formula * y3 + y; + return formula; +} +template <typename T> +constexpr inline T cos(T x) +{ + return sin(x + c_pi<T, 1, 2>); +} +} // namespace compiletime +} // namespace kfr diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp @@ -0,0 +1,410 @@ +/** @addtogroup complex + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../simd/complex.hpp" +#include "abs.hpp" +#include "atan.hpp" +#include "hyperbolic.hpp" +#include "log_exp.hpp" +#include "min_max.hpp" +#include "select.hpp" +#include "sin_cos.hpp" +#include "sqrt.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> csin(const vec<complex<T>, N>& x) +{ + return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> csinh(const vec<complex<T>, N>& x) +{ + return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> ccos(const vec<complex<T>, N>& x) +{ + return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x) +{ + return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> cabs(const vec<complex<T>, N>& x) +{ + const vec<T, N* 2> xx = sqr(cdecom(x)); + return sqrt(even(xx) + odd(xx)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> carg(const vec<complex<T>, N>& x) +{ + const vec<T, N* 2> xx = cdecom(x); + return atan2(even(xx), odd(xx)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> clog(const vec<complex<T>, N>& x) +{ + return make_complex(log(cabs(x)), carg(x)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> clog2(const vec<complex<T>, N>& x) +{ + return clog(x) * avoid_odr_use(c_recip_log_2<T>); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> clog10(const vec<complex<T>, N>& x) +{ + return clog(x) * avoid_odr_use(c_recip_log_10<T>); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cexp(const vec<complex<T>, N>& x) +{ + 
return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x)))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x) +{ + return cexp(x * avoid_odr_use(c_log_2<T>)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x) +{ + return cexp(x * avoid_odr_use(c_log_10<T>)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> polar(const vec<complex<T>, N>& x) +{ + return make_complex(cabs(x), carg(x)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x) +{ + return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x)))); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> cabsdup(const vec<T, N>& x) +{ + x = sqr(x); + return sqrt(x + swap<2>(x)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x) +{ + const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5); + return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t)); +} + +KFR_HANDLE_SCALAR(cconj) +KFR_HANDLE_SCALAR(csin) +KFR_HANDLE_SCALAR(csinh) +KFR_HANDLE_SCALAR(ccos) +KFR_HANDLE_SCALAR(ccosh) +KFR_HANDLE_SCALAR(clog) +KFR_HANDLE_SCALAR(clog2) +KFR_HANDLE_SCALAR(clog10) +KFR_HANDLE_SCALAR(cexp) +KFR_HANDLE_SCALAR(cexp2) +KFR_HANDLE_SCALAR(cexp10) +KFR_HANDLE_SCALAR(polar) +KFR_HANDLE_SCALAR(cartesian) +KFR_HANDLE_SCALAR(csqrt) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> cabs(const vec<T, N>& a) +{ + return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a))); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> carg(const vec<T, N>& a) +{ + return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a))); +} +template <typename T1> +KFR_INTRINSIC realtype<T1> cabs(const T1& a) +{ + using vecout = vec1<T1>; + return to_scalar(intrinsics::cabs(vecout(a))); +} +template <typename T1> +KFR_INTRINSIC 
realtype<T1> carg(const T1& a) +{ + using vecout = vec1<T1>; + return to_scalar(intrinsics::carg(vecout(a))); +} +} // namespace intrinsics + +KFR_I_FN(cconj) +KFR_I_FN(csin) +KFR_I_FN(csinh) +KFR_I_FN(ccos) +KFR_I_FN(ccosh) +KFR_I_FN(cabs) +KFR_I_FN(carg) +KFR_I_FN(clog) +KFR_I_FN(clog2) +KFR_I_FN(clog10) +KFR_I_FN(cexp) +KFR_I_FN(cexp2) +KFR_I_FN(cexp10) +KFR_I_FN(polar) +KFR_I_FN(cartesian) +KFR_I_FN(csqrt) + +/// @brief Returns the sine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 csin(const T1& x) +{ + return intrinsics::csin(x); +} + +/// @brief Returns template expression that returns the sine of the the complex value x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::csin, E1> csin(E1&& x) +{ + return { fn::csin(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic sine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 csinh(const T1& x) +{ + return intrinsics::csinh(x); +} + +/// @brief Returns template expression that returns the hyperbolic sine of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::csinh, E1> csinh(E1&& x) +{ + return { fn::csinh(), std::forward<E1>(x) }; +} + +/// @brief Returns the cosine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 ccos(const T1& x) +{ + return intrinsics::ccos(x); +} + +/// @brief Returns template expression that returns the cosine of the the complex value x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::ccos, E1> ccos(E1&& x) +{ + return { fn::ccos(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cosine of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> 
+KFR_FUNCTION T1 ccosh(const T1& x) +{ + return intrinsics::ccosh(x); +} + +/// @brief Returns template expression that returns the hyperbolic cosine of the the complex value x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::ccosh, E1> ccosh(E1&& x) +{ + return { fn::ccosh(), std::forward<E1>(x) }; +} + +/// @brief Returns the absolute value (magnitude) of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION realtype<T1> cabs(const T1& x) +{ + return intrinsics::cabs(x); +} + +/// @brief Returns template expression that returns the absolute value (magnitude) of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cabs, E1> cabs(E1&& x) +{ + return { fn::cabs(), std::forward<E1>(x) }; +} + +/// @brief Returns the phase angle (argument) of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION realtype<T1> carg(const T1& x) +{ + return intrinsics::carg(x); +} + +/// @brief Returns template expression that returns the phase angle (argument) of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::carg, E1> carg(E1&& x) +{ + return { fn::carg(), std::forward<E1>(x) }; +} + +/// @brief Returns template expression that returns the complex conjugate of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cconj, E1> cconj(E1&& x) +{ + return { fn::cconj(), std::forward<E1>(x) }; +} + +/// @brief Returns the natural logarithm of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 clog(const T1& x) +{ + return intrinsics::clog(x); +} + +/// @brief Returns template expression that returns the natural logarithm of the 
complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::clog, E1> clog(E1&& x) +{ + return { fn::clog(), std::forward<E1>(x) }; +} + +/// @brief Returns the binary (base-2) logarithm of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 clog2(const T1& x) +{ + return intrinsics::clog2(x); +} + +/// @brief Returns template expression that returns the binary (base-2) logarithm of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::clog2, E1> clog2(E1&& x) +{ + return { fn::clog2(), std::forward<E1>(x) }; +} + +/// @brief Returns the common (base-10) logarithm of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 clog10(const T1& x) +{ + return intrinsics::clog10(x); +} + +/// @brief Returns template expression that returns the common (base-10) logarithm of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::clog10, E1> clog10(E1&& x) +{ + return { fn::clog10(), std::forward<E1>(x) }; +} + +/// @brief Returns \f$e\f$ raised to the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cexp(const T1& x) +{ + return intrinsics::cexp(x); +} + +/// @brief Returns template expression that returns \f$e\f$ raised to the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cexp, E1> cexp(E1&& x) +{ + return { fn::cexp(), std::forward<E1>(x) }; +} + +/// @brief Returns 2 raised to the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cexp2(const T1& x) +{ + return intrinsics::cexp2(x); +} + +/// @brief Returns template expression that returns 2 raised to the 
complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cexp2, E1> cexp2(E1&& x) +{ + return { fn::cexp2(), std::forward<E1>(x) }; +} + +/// @brief Returns 10 raised to the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cexp10(const T1& x) +{ + return intrinsics::cexp10(x); +} + +/// @brief Returns template expression that returns 10 raised to the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cexp10, E1> cexp10(E1&& x) +{ + return { fn::cexp10(), std::forward<E1>(x) }; +} + +/// @brief Converts complex number to polar +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 polar(const T1& x) +{ + return intrinsics::polar(x); +} + +/// @brief Returns template expression that converts complex number to polar +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::polar, E1> polar(E1&& x) +{ + return { fn::polar(), std::forward<E1>(x) }; +} + +/// @brief Converts complex number to cartesian +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 cartesian(const T1& x) +{ + return intrinsics::cartesian(x); +} + +/// @brief Returns template expression that converts complex number to cartesian +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cartesian, E1> cartesian(E1&& x) +{ + return { fn::cartesian(), std::forward<E1>(x) }; +} + +/// @brief Returns square root of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 csqrt(const T1& x) +{ + return intrinsics::csqrt(x); +} + +/// @brief Returns template expression that returns square root of the complex number x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> 
+KFR_FUNCTION internal::expression_function<fn::csqrt, E1> csqrt(E1&& x) +{ + return { fn::csqrt(), std::forward<E1>(x) }; +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/gamma.hpp b/include/kfr/math/gamma.hpp @@ -0,0 +1,63 @@ +/** @addtogroup other_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "impl/gamma.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the approximate gamma function of an argument +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> gamma(const T1& x) +{ + return intrinsics::gamma(x); +} + +/// @brief Creates expression that returns the approximate gamma function of an argument +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::gamma, E1> gamma(E1&& x) +{ + return { fn::gamma(), std::forward<E1>(x) }; +} + +/// @brief Returns the approximate factorial of an argument +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> factorial_approx(const T1& x) +{ + return intrinsics::factorial_approx(x); +} + +/// @brief Creates expression that returns the approximate factorial of an argument +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::factorial_approx, E1> factorial_approx(E1&& x) +{ + return { fn::factorial_approx(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/hyperbolic.hpp b/include/kfr/math/hyperbolic.hpp @@ -0,0 +1,123 @@ +/** @addtogroup hyperbolic + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/hyperbolic.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the hyperbolic sine of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sinh(const T1& x) +{ + return intrinsics::sinh(x); +} + +/// @brief Returns template expression that returns the hyperbolic sine of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sinh, E1> sinh(E1&& x) +{ + return { fn::sinh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cosine of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cosh(const T1& x) +{ + return intrinsics::cosh(x); +} + +/// @brief Returns template expression that returns the hyperbolic cosine of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cosh, E1> cosh(E1&& x) +{ + return { fn::cosh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic tangent of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> tanh(const T1& x) +{ + return intrinsics::tanh(x); +} + +/// @brief Returns template expression that returns the hyperbolic tangent of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::tanh, E1> tanh(E1&& x) +{ + return { fn::tanh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cotangent of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> coth(const T1& x) +{ + return 
intrinsics::coth(x); +} + +/// @brief Returns template expression that returns the hyperbolic cotangent of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::coth, E1> coth(E1&& x) +{ + return { fn::coth(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic sine of the even elements of the x and the hyperbolic cosine of the odd +/// elements of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sinhcosh(const T1& x) +{ + return intrinsics::sinhcosh(x); +} + +/// @brief Returns template expression that returns the hyperbolic sine of the even elements of the x and the +/// hyperbolic cosine of the odd elements of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sinhcosh, E1> sinhcosh(E1&& x) +{ + return { fn::sinhcosh(), std::forward<E1>(x) }; +} + +/// @brief Returns the hyperbolic cosine of the even elements of the x and the hyperbolic sine of the odd +/// elements of the x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> coshsinh(const T1& x) +{ + return intrinsics::coshsinh(x); +} + +/// @brief Returns template expression that returns the hyperbolic cosine of the even elements of the x and +/// the hyperbolic sine of the odd elements of the x +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::coshsinh, E1> coshsinh(E1&& x) +{ + return { fn::coshsinh(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/abs.hpp b/include/kfr/math/impl/abs.hpp @@ -0,0 +1,138 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software 
Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/select.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & avoid_odr_use(special_constants<T>::invhighbitmask()); +} + +KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT +{ + const __m128i sh = _mm_srai_epi32(x.v, 31); + const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk); +} +KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); } +KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); } +KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); } +KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; } + +#if defined 
CMT_ARCH_AVX2 +KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT +{ + const __m256i sh = _mm256_srai_epi32(x.v, 31); + const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk); +} +KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); } +KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); } +KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); } +KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); } +KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); } +KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); } +KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); } +KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>::value) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); } +KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); } +KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { 
return vabsq_s64(x.v); } +#else +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); } +#endif + +KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; } + +KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); } +#else +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT +{ + return x & avoid_odr_use(special_constants<f64>::invhighbitmask()); +} +#endif + +KFR_HANDLE_ALL_SIZES_1(abs) + +#else + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & avoid_odr_use(special_constants<T>::invhighbitmask()); +} + +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return select(x >= T(0), x, -x); +} +#endif +KFR_HANDLE_SCALAR(abs) +} // namespace intrinsics + +KFR_I_FN(abs) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/asin_acos.hpp b/include/kfr/math/impl/asin_acos.hpp @@ -0,0 +1,58 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/atan.hpp" +#include "../../math/select.hpp" +#include "../../math/sqrt.hpp" +#include "../../simd/impl/function.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> asin(const vec<T, N>& x) +{ + const vec<Tout, N> xx = x; + return atan2(xx, sqrt(Tout(1) - xx * xx)); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> acos(const vec<T, N>& x) +{ + const vec<Tout, N> xx = x; + return atan2(sqrt(Tout(1) - xx * xx), xx); +} +KFR_HANDLE_SCALAR(asin) +KFR_HANDLE_SCALAR(acos) +} // namespace intrinsics +KFR_I_FN(asin) +KFR_I_FN(acos) +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/math/impl/atan.hpp b/include/kfr/math/impl/atan.hpp @@ -0,0 +1,230 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once +#include "../../math/abs.hpp" +#include "../../math/select.hpp" +#include "../../math/sin_cos.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx) +{ + vec<f32, N> x = xx, y = yy; + vec<f32, N> s, t, u; + vec<i32, N> q; + q = select(x < 0, -2, 0); + x = select(x < 0, -x, x); + mask<i32, N> m; + m = y > x; + t = x; + x = select(m, y, x); + y = select(m, -t, y); + q = select(m, q + 1, q); + s = y / x; + t = s * s; + u = 0.00282363896258175373077393f; + u = fmadd(u, t, -0.0159569028764963150024414f); + u = fmadd(u, t, 0.0425049886107444763183594f); + u = fmadd(u, t, -0.0748900920152664184570312f); + u = fmadd(u, t, 0.106347933411598205566406f); + u = fmadd(u, t, -0.142027363181114196777344f); + u = fmadd(u, t, 0.199926957488059997558594f); + u = fmadd(u, t, -0.333331018686294555664062f); + t = u * t * s + s; + t = innercast<f32>(q) * 1.5707963267948966192313216916398f + t; + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx) +{ + vec<f64, N> x = xx, y = yy; + vec<f64, N> s, t, u; + vec<i64, N> q; + q = select(x < 0, i64(-2), i64(0)); + x = select(x < 0, -x, x); + mask<i64, N> m; + m = y > x; + t = x; + x = select(m, y, x); + y = select(m, -t, y); + q = select(m, q + i64(1), q); + s = y / x; + t = s * s; + u = -1.88796008463073496563746e-05; + u = fmadd(u, t, 0.000209850076645816976906797); + u = fmadd(u, t, -0.00110611831486672482563471); + u = fmadd(u, t, 
0.00370026744188713119232403); + u = fmadd(u, t, -0.00889896195887655491740809); + u = fmadd(u, t, 0.016599329773529201970117); + u = fmadd(u, t, -0.0254517624932312641616861); + u = fmadd(u, t, 0.0337852580001353069993897); + u = fmadd(u, t, -0.0407629191276836500001934); + u = fmadd(u, t, 0.0466667150077840625632675); + u = fmadd(u, t, -0.0523674852303482457616113); + u = fmadd(u, t, 0.0587666392926673580854313); + u = fmadd(u, t, -0.0666573579361080525984562); + u = fmadd(u, t, 0.0769219538311769618355029); + u = fmadd(u, t, -0.090908995008245008229153); + u = fmadd(u, t, 0.111111105648261418443745); + u = fmadd(u, t, -0.14285714266771329383765); + u = fmadd(u, t, 0.199999999996591265594148); + u = fmadd(u, t, -0.333333333333311110369124); + t = u * t * s + s; + t = innercast<f64>(q) * 1.5707963267948966192313216916398 + t; + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x) +{ + vec<f32, N> r = atan2k(abs(y), x); + constexpr f32 pi = 3.1415926535897932384626433832795f; + constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f; + constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f; + r = mulsign(r, x); + r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r); + r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r); + r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r); + r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); + return r; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) +{ + vec<f64, N> r = atan2k(abs(y), x); + constexpr f64 pi = 3.1415926535897932384626433832795; + constexpr f64 pi_over_2 = 1.5707963267948966192313216916398; + constexpr f64 pi_over_4 = 0.78539816339744830961566084581988; + r = mulsign(r, x); + r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r); + r = select(isinf(y), pi_over_2 - 
select(x.asmask(), mulsign(pi_over_4, x), 0.0), r); + r = select(y == 0.0, select(x < 0., pi, 0.), r); + r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); + return r; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan(const vec<f32, N>& x) +{ + vec<f32, N> t, u; + vec<i32, N> q; + q = select(x < 0.f, 2, 0); + vec<f32, N> s = select(x < 0.f, -x, x); + q = select(s > 1.f, q | 1, q); + s = select(s > 1.f, 1.0f / s, s); + t = s * s; + u = 0.00282363896258175373077393f; + u = fmadd(u, t, -0.0159569028764963150024414f); + u = fmadd(u, t, 0.0425049886107444763183594f); + u = fmadd(u, t, -0.0748900920152664184570312f); + u = fmadd(u, t, 0.106347933411598205566406f); + u = fmadd(u, t, -0.142027363181114196777344f); + u = fmadd(u, t, 0.199926957488059997558594f); + u = fmadd(u, t, -0.333331018686294555664062f); + t = s + s * (t * u); + t = select((q & 1) != 0, 1.570796326794896557998982f - t, t); + t = select((q & 2) != 0, -t, t); + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan(const vec<f64, N>& x) +{ + vec<f64, N> t, u; + vec<i64, N> q; + q = select(x < 0.0, i64(2), i64(0)); + vec<f64, N> s = select(x < 0.0, -x, x); + q = select(s > 1.0, q | 1, q); + s = select(s > 1.0, 1.0 / s, s); + t = s * s; + u = -1.88796008463073496563746e-05; + u = fmadd(u, t, 0.000209850076645816976906797); + u = fmadd(u, t, -0.00110611831486672482563471); + u = fmadd(u, t, 0.00370026744188713119232403); + u = fmadd(u, t, -0.00889896195887655491740809); + u = fmadd(u, t, 0.016599329773529201970117); + u = fmadd(u, t, -0.0254517624932312641616861); + u = fmadd(u, t, 0.0337852580001353069993897); + u = fmadd(u, t, -0.0407629191276836500001934); + u = fmadd(u, t, 0.0466667150077840625632675); + u = fmadd(u, t, -0.0523674852303482457616113); + u = fmadd(u, t, 0.0587666392926673580854313); + u = fmadd(u, t, -0.0666573579361080525984562); + u = fmadd(u, t, 0.0769219538311769618355029); + u = fmadd(u, t, -0.090908995008245008229153); + u = fmadd(u, t, 
0.111111105648261418443745); + u = fmadd(u, t, -0.14285714266771329383765); + u = fmadd(u, t, 0.199999999996591265594148); + u = fmadd(u, t, -0.333333333333311110369124); + t = s + s * (t * u); + t = select((q & 1) != 0, 1.570796326794896557998982 - t, t); + t = select((q & 2) != 0, -t, t); + return t; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atandeg(const vec<f32, N>& x) +{ + return atan(x) * c_radtodeg<f32>; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atandeg(const vec<f64, N>& x) +{ + return atan(x) * c_radtodeg<f64>; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> atan2deg(const vec<f32, N>& y, const vec<f32, N>& x) +{ + return atan2(y, x) * c_radtodeg<f32>; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> atan2deg(const vec<f64, N>& y, const vec<f64, N>& x) +{ + return atan2(y, x) * c_radtodeg<f64>; +} + +KFR_HANDLE_SCALAR(atan) +KFR_HANDLE_SCALAR(atan2) +KFR_HANDLE_SCALAR(atandeg) +KFR_HANDLE_SCALAR(atan2deg) +} // namespace intrinsics +KFR_I_FN(atan) +KFR_I_FN(atandeg) +KFR_I_FN(atan2) +KFR_I_FN(atan2deg) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/clamp.hpp b/include/kfr/math/impl/clamp.hpp @@ -0,0 +1,55 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/min_max.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T> +KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) +{ + return max(min(x, hi), zerovector<T, N>()); +} +} // namespace intrinsics +KFR_I_FN(clamp) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/gamma.hpp b/include/kfr/math/impl/gamma.hpp @@ -0,0 +1,71 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once +#include "../../math/log_exp.hpp" +#include "../../simd/impl/function.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wc99-extensions") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ +template <typename T> +constexpr T gamma_precalc[] = { + 0x2.81b263fec4e08p+0, 0x3.07b4100e04448p+16, -0xa.a0da01d4d4e2p+16, 0xf.05ccb27bb9dbp+16, + -0xa.fa79616b7c6ep+16, 0x4.6dd6c10d4df5p+16, -0xf.a2304199eb4ap+12, 0x1.c21dd4aade3dp+12, + -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0, -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24, +}; + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> gamma(const vec<T, N>& z) +{ + constexpr size_t Count = arraysize(gamma_precalc<T>); + vec<T, N> accm = gamma_precalc<T>[0]; + CMT_LOOP_UNROLL + for (size_t k = 1; k < Count; k++) + accm += gamma_precalc<T>[k] / (z + innercast<utype<T>>(k)); + accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); + return accm / z; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> factorial_approx(const vec<T, N>& x) +{ + return gamma(x + T(1)); +} +KFR_HANDLE_SCALAR(gamma) +KFR_HANDLE_SCALAR(factorial_approx) +} // namespace intrinsics +KFR_I_FN(gamma) +KFR_I_FN(factorial_approx) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/math/impl/hyperbolic.hpp b/include/kfr/math/impl/hyperbolic.hpp @@ -0,0 +1,99 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../math/log_exp.hpp" +#include "../../math/min_max.hpp" +#include "../../math/select.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> sinh(const vec<T, N>& x) +{ + const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); + return (exp(xx) - exp(-xx)) * Tout(0.5); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> cosh(const vec<T, N>& x) +{ + const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x); + return (exp(xx) + exp(-xx)) * Tout(0.5); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> tanh(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(2 * x); + return (a - 1) / (a + 1); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> coth(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(2 * x); + return (a + 1) / (a - 1); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> sinhcosh(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(x); + const vec<Tout, N> b = exp(-x); + return subadd(a, b) * 
Tout(0.5); +} + +template <typename T, size_t N, typename Tout = flt_type<T>> +KFR_INTRINSIC vec<Tout, N> coshsinh(const vec<T, N>& x) +{ + const vec<Tout, N> a = exp(x); + const vec<Tout, N> b = exp(-x); + return addsub(a, b) * Tout(0.5); +} + +KFR_HANDLE_SCALAR_1_T(sinh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(cosh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(tanh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(coth, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(sinhcosh, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(coshsinh, flt_type<T>) +} // namespace intrinsics +KFR_I_FN(sinh) +KFR_I_FN(cosh) +KFR_I_FN(tanh) +KFR_I_FN(coth) +KFR_I_FN(sinhcosh) +KFR_I_FN(coshsinh) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/log_exp.hpp b/include/kfr/math/impl/log_exp.hpp @@ -0,0 +1,335 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#include "../../math/abs.hpp"
#include "../../math/clamp.hpp"
#include "../../math/min_max.hpp"
#include "../../math/round.hpp"
#include "../../math/select.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// vilogbp1(d): per-lane biased exponent extraction, returning ilogb(d) + 1.
// Inputs below ~5.42e-20 (i.e. near/below the normal range) are first scaled
// by 2^64 so the bit-field extraction also covers denormals; the extra 64 is
// subtracted back out in the select.
template <size_t N>
KFR_INTRINSIC vec<i32, N> vilogbp1(const vec<f32, N>& d)
{
    mask<i32, N> m = d < 5.421010862427522E-20f;
    vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff;
    q = select(m, q - (64 + 0x7e), q - 0x7e);
    return q;
}

// f64 variant: small inputs are pre-scaled by 2^300 before extracting the
// 11-bit exponent field; the 300 is removed again afterwards.
template <size_t N>
KFR_INTRINSIC vec<i64, N> vilogbp1(const vec<f64, N>& d)
{
    mask<i64, N> m = d < 4.9090934652977266E-91;
    vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff;
    q = select(m, q - (300 + 0x03fe), q - 0x03fe);
    return q;
}

// vldexpk(x, q): compute x * 2^q. The exponent q is split into parts that are
// applied as separate power-of-two factors (pow4 applies one part four times),
// avoiding overflow of a single biased-exponent field for large |q|.
template <size_t N>
KFR_INTRINSIC vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q)
{
    vec<i32, N> m = q >> 31;
    m = (((m + q) >> 6) - m) << 4;
    const vec<i32, N> qq = q - (m << 2);
    m = clamp(m + 0x7f, vec<i32, N>(0xff));
    vec<f32, N> u = pow4(bitcast<f32>(innercast<i32>(m) << 23));
    return x * u * bitcast<f32>((innercast<i32>(qq + 0x7f)) << 23);
}

template <size_t N>
KFR_INTRINSIC vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q)
{
    vec<i64, N> m = q >> 31;
    m = (((m + q) >> 9) - m) << 7;
    const vec<i64, N> qq = q - (m << 2);
    m = clamp(m + 0x3ff, i64(0x7ff));
    vec<f64, N> u = pow4(bitcast<f64>(innercast<i64>(m) << 52));
    return x * u * bitcast<f64>((innercast<i64>(qq + 0x3ff)) << 52);
}

// logb(x): unbiased exponent as a floating-point value; -infinity for x == 0.
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> logb(const vec<T, N>& x)
{
    return select(x == T(), -avoid_odr_use(c_infinity<T>), static_cast<vec<T, N>>(vilogbp1(x) - 1));
}

// Natural logarithm, f32. Decomposes d = m * 2^e with m near 1 (the 0.7071
// pre-scale centers m around sqrt(2)/2..sqrt(2)), then evaluates a polynomial
// in x = (m - 1) / (m + 1). Special cases via sp and the final select:
// d < 0 -> qnan, d == 0 -> -infinity.
template <size_t N>
KFR_INTRINSIC vec<f32, N> log(const vec<f32, N>& d)
{
    vec<i32, N> e = vilogbp1(d * 0.7071); // 0.7071 ~ 1/sqrt(2) = 0.70710678118654752440084436210485
    vec<f32, N> m = vldexpk(d, -e);

    vec<f32, N> x = (m - 1.0f) / (m + 1.0f);
    vec<f32, N> x2 = x * x;

    vec<f32, N> sp =
        select(d < 0, avoid_odr_use(constants<f32>::qnan), avoid_odr_use(constants<f32>::neginfinity));

    vec<f32, N> t = 0.2371599674224853515625f;
    t = fmadd(t, x2, 0.285279005765914916992188f);
    t = fmadd(t, x2, 0.400005519390106201171875f);
    t = fmadd(t, x2, 0.666666567325592041015625f);
    t = fmadd(t, x2, 2.0f);

    // ln(d) = poly(x) * x + e * ln(2)
    x = x * t + c_log_2<f32> * innercast<f32>(e);
    x = select(d > 0, x, sp);

    return x;
}

// Natural logarithm, f64 — same scheme as the f32 version with a
// higher-degree polynomial.
template <size_t N>
KFR_INTRINSIC vec<f64, N> log(const vec<f64, N>& d)
{
    vec<i64, N> e = vilogbp1(d * 0.7071); // 0.7071 ~ 1/sqrt(2) = 0.70710678118654752440084436210485
    vec<f64, N> m = vldexpk(d, -e);

    vec<f64, N> x = (m - 1.0) / (m + 1.0);
    vec<f64, N> x2 = x * x;

    vec<f64, N> sp =
        select(d < 0, avoid_odr_use(constants<f64>::qnan), avoid_odr_use(constants<f64>::neginfinity));

    vec<f64, N> t = 0.148197055177935105296783;
    t = fmadd(t, x2, 0.153108178020442575739679);
    t = fmadd(t, x2, 0.181837339521549679055568);
    t = fmadd(t, x2, 0.22222194152736701733275);
    t = fmadd(t, x2, 0.285714288030134544449368);
    t = fmadd(t, x2, 0.399999999989941956712869);
    t = fmadd(t, x2, 0.666666666666685503450651);
    t = fmadd(t, x2, 2);

    x = x * t + avoid_odr_use(constants<f64>::log_2) * innercast<f64>(e);
    x = select(d > 0, x, sp);

    return x;
}

// log2(x) = ln(x) / ln(2)
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> log2(const vec<T, N>& x)
{
    return log(innercast<Tout>(x)) * avoid_odr_use(constants<Tout>::recip_log_2);
}
// log10(x) = ln(x) / ln(10)
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> log10(const vec<T, N>& x)
{
    return log(innercast<Tout>(x)) * avoid_odr_use(constants<Tout>::recip_log_10);
}

// Exponential, f32. Argument reduction d = q*ln(2) + s; ln(2) is split into
// two parts (ln2_part1 + ln2_part2) so the two fmadd subtractions lose less
// precision. e^s is a degree-7 polynomial, then the result is scaled by 2^q
// via vldexpk. exp(-infinity) is forced to exactly 0.
template <size_t N>
KFR_INTRINSIC vec<f32, N> exp(const vec<f32, N>& d)
{
    const f32 ln2_part1 = 0.6931457519f;
    const f32 ln2_part2 = 1.4286067653e-6f;

    vec<i32, N> q = innercast<i32>(floor(d * avoid_odr_use(constants<f32>::recip_log_2)));
    vec<f32, N> s, u;

    s = fmadd(innercast<f32>(q), -ln2_part1, d);
    s = fmadd(innercast<f32>(q), -ln2_part2, s);

    const f32 c2 = 0.4999999105930328369140625f;
    const f32 c3 = 0.166668415069580078125f;
    const f32 c4 = 4.16539050638675689697265625e-2f;
    const f32 c5 = 8.378830738365650177001953125e-3f;
    const f32 c6 = 1.304379315115511417388916015625e-3f;
    const f32 c7 = 2.7555381529964506626129150390625e-4f;

    u = c7;
    u = fmadd(u, s, c6);
    u = fmadd(u, s, c5);
    u = fmadd(u, s, c4);
    u = fmadd(u, s, c3);
    u = fmadd(u, s, c2);

    u = s * s * u + s + 1.0f;
    u = vldexpk(u, q);

    u = select(d == avoid_odr_use(constants<f32>::neginfinity), 0.f, u);

    return u;
}

// Exponential, f64 — same reduction with a degree-11 polynomial.
template <size_t N>
KFR_INTRINSIC vec<f64, N> exp(const vec<f64, N>& d)
{
    const f64 ln2_part1 = 0.69314717501401901245;
    const f64 ln2_part2 = 5.545926273775592108e-009;

    vec<i64, N> q = innercast<i64>(floor(d * avoid_odr_use(constants<f64>::recip_log_2)));
    vec<f64, N> s, u;

    s = fmadd(innercast<f64>(q), -ln2_part1, d);
    s = fmadd(innercast<f64>(q), -ln2_part2, s);

    const f64 c2 = 0.499999999999994948485237955537741072475910186767578;
    const f64 c3 = 0.166666666667024204739888659787538927048444747924805;
    const f64 c4 = 4.16666666578945840693215529881854308769106864929199e-2;
    const f64 c5 = 8.3333334397461874404333670440792047884315252304077e-3;
    const f64 c6 = 1.3888881489747750223179290074426717183087021112442e-3;
    const f64 c7 = 1.9841587032493949419205414574918222569976933300495e-4;
    const f64 c8 = 2.47929324077393282239802768662784160369483288377523e-5;
    const f64 c9 = 2.77076037925831049422552981864598109496000688523054e-6;
    const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7;
    const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8;

    u = c11;
    u = fmadd(u, s, c10);
    u = fmadd(u, s, c9);
    u = fmadd(u, s, c8);
    u = fmadd(u, s, c7);
    u = fmadd(u, s, c6);
    u = fmadd(u, s, c5);
    u = fmadd(u, s, c4);
    u = fmadd(u, s, c3);
    u = fmadd(u, s, c2);

    u = s * s * u + s + 1.0;
    u = vldexpk(u, q);

    u = select(d == avoid_odr_use(constants<f64>::neginfinity), 0.0, u);

    return u;
}
// exp2(x) = e^(x * ln(2))
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> exp2(const vec<T, N>& x)
{
    return exp(x * avoid_odr_use(constants<Tout>::log_2));
}
// exp10(x) = e^(x * ln(10))
template <typename T, size_t N, typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> exp10(const vec<T, N>& x)
{
    return exp(x * avoid_odr_use(constants<Tout>::log_10));
}

// pow(a, b) = exp(b * ln|a|) with sign fix-up:
//   a > 0  -> t
//   a == 0 -> 0
//   a < 0  -> t or -t when b is an integer (sign chosen by b's parity),
//             qnan otherwise (negative base, non-integer exponent).
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b)
{
    const vec<T, N> t = exp(b * log(abs(a)));
    const mask<T, N> isint = floor(b) == b;
    const mask<T, N> iseven = (innercast<itype<T>>(b) & 1) == 0;
    return select(
        a > T(), t,
        select(a == T(), T(),
               select(isint, select(iseven, t, -t), broadcast<N>(avoid_odr_use(constants<T>::qnan)))));
}

// b-th root: x^(1/b) = exp(ln(x) / b)
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b)
{
    return exp(reciprocal(b) * log(x));
}

// Cube root via pow(x, 1/3).
// NOTE(review): through pow, a negative x yields qnan here (non-integer
// exponent path) rather than the real negative cube root — confirm the
// intended domain is x >= 0.
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> cbrt(const vec<T, N>& x)
{
    return pow<T, N>(x, T(0.333333333333333333333333333333333));
}

// Integer-typed inputs are converted to the matching float type first.
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
KFR_INTRINSIC vec<Tout, N> cbrt(const vec<T, N>& x)
{
    return cbrt(innercast<Tout>(x));
}

// Scalar forwarding through the vector kernels.
KFR_HANDLE_SCALAR_1_T(exp, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(exp2, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(exp10, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(log, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(log2, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(log10, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(logb, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(pow, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(root, flt_type<T>)
KFR_HANDLE_SCALAR_1_T(cbrt, flt_type<T>)

KFR_HANDLE_ARGS_T(exp, flt_type<T>)
KFR_HANDLE_ARGS_T(exp2, flt_type<T>)
KFR_HANDLE_ARGS_T(exp10, flt_type<T>)
KFR_HANDLE_ARGS_T(log, flt_type<T>)
KFR_HANDLE_ARGS_T(log2, flt_type<T>)
KFR_HANDLE_ARGS_T(log10, flt_type<T>)
KFR_HANDLE_ARGS_T(logb, flt_type<T>)
KFR_HANDLE_ARGS_T(pow, flt_type<T>)
KFR_HANDLE_ARGS_T(root, flt_type<T>)
KFR_HANDLE_ARGS_T(cbrt, flt_type<T>)

KFR_HANDLE_NOT_F_1(exp)
KFR_HANDLE_NOT_F_1(log)
KFR_HANDLE_NOT_F_1(logb)
KFR_HANDLE_NOT_F_1(pow)
KFR_HANDLE_NOT_F_1(root)
KFR_HANDLE_NOT_F_1(cbrt)

// logn(a, b): logarithm of a to base b = ln(a) / ln(b)
template <typename T1, typename T2>
KFR_INTRINSIC flt_type<common_type<T1, T2>> logn(const T1& a, const T2& b)
{
    return log(a) / log(b);
}

// logm(a, b): ln(a) * b
template <typename T1, typename T2>
KFR_INTRINSIC flt_type<common_type<T1, T2>> logm(const T1& a, const T2& b)
{
    return log(a) * b;
}

// exp_fmadd(x, m, a): exp(x * m + a)
template <typename T1, typename T2, typename T3>
KFR_INTRINSIC flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a)
{
    return exp(fmadd(x, m, a));
}

// log_fmadd(x, m, a): ln(x) * m + a
template <typename T1, typename T2, typename T3>
KFR_INTRINSIC flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a)
{
    return fmadd(log(x), m, a);
}
} // namespace intrinsics
// Expose fn_* functor wrappers for expression templates.
KFR_I_FN(exp)
KFR_I_FN(exp2)
KFR_I_FN(exp10)
KFR_I_FN(log)
KFR_I_FN(log2)
KFR_I_FN(log10)
KFR_I_FN(logb)
KFR_I_FN(logn)
KFR_I_FN(logm)
KFR_I_FN(exp_fmadd)
KFR_I_FN(log_fmadd)
KFR_I_FN(pow)
KFR_I_FN(root)
KFR_I_FN(cbrt)
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp
@@ -0,0 +1,278 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +#if defined CMT_ARCH_SSE41 + +// horizontal OR +KFR_INTRINSIC bool bittestany(const u8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u64sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i64sse& x) { return !_mm_testz_si128(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const u8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u64sse& x) 
{ return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +#endif + +#if defined CMT_ARCH_AVX +// horizontal OR +KFR_INTRINSIC bool bittestany(const f32sse& x) { return !_mm_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const f64sse& x) { return !_mm_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const f32avx& x) { return !_mm256_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const f64avx& x) { return !_mm256_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const u8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const u64avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const i64avx& x) { return !_mm256_testz_si256(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const f32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const f64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } + +KFR_INTRINSIC bool bittestall(const f32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const f64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); } + 
+KFR_INTRINSIC bool bittestall(const u8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const u64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const i64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } + +#if defined CMT_ARCH_AVX512 +// horizontal OR +KFR_INTRINSIC bool bittestany(const f32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); } +KFR_INTRINSIC bool bittestany(const f64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } +KFR_INTRINSIC bool bittestany(const u8avx512& x) { return _mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestany(const u16avx512& x) { return _mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestany(const u32avx512& x) { return _mm512_movepi32_mask(x.v); } +KFR_INTRINSIC bool bittestany(const u64avx512& x) { return _mm512_movepi64_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i8avx512& x) { return _mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i16avx512& x) { return _mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i32avx512& x) { return _mm512_movepi32_mask(x.v); } +KFR_INTRINSIC bool bittestany(const i64avx512& x) { return _mm512_movepi64_mask(x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const f32avx512& x) { return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); } +KFR_INTRINSIC bool bittestall(const f64avx512& x) { return 
!~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } +KFR_INTRINSIC bool bittestall(const u8avx512& x) { return !~_mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestall(const u16avx512& x) { return !~_mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestall(const u32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const u64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const i8avx512& x) { return !~_mm512_movepi8_mask(x.v); } +KFR_INTRINSIC bool bittestall(const i16avx512& x) { return !~_mm512_movepi16_mask(x.v); } +KFR_INTRINSIC bool bittestall(const i32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } +KFR_INTRINSIC bool bittestall(const i64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } + +#endif + +#elif defined CMT_ARCH_SSE41 +KFR_INTRINSIC bool bittestany(const f32sse& x) +{ + return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v); +} +KFR_INTRINSIC bool bittestany(const f64sse& x) +{ + return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v); +} +KFR_INTRINSIC bool bittestall(const f32sse& x) +{ + return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v); +} +KFR_INTRINSIC bool bittestall(const f64sse& x) +{ + return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v); +} +#endif + +#if !defined CMT_ARCH_SSE41 + +KFR_INTRINSIC bool bittestany(const f32sse& x) { return _mm_movemask_ps(x.v); } +KFR_INTRINSIC bool bittestany(const f64sse& x) { return _mm_movemask_pd(x.v); } +KFR_INTRINSIC bool bittestany(const u8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const u16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const u32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const u64sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const i8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool 
bittestany(const i16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const i32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const i64sse& x) { return _mm_movemask_epi8(x.v); } + +KFR_INTRINSIC bool bittestall(const f32sse& x) { return !_mm_movemask_ps((~x).v); } +KFR_INTRINSIC bool bittestall(const f64sse& x) { return !_mm_movemask_pd((~x).v); } +KFR_INTRINSIC bool bittestall(const u8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const u16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const u32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const u64sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const i64sse& x) { return !_mm_movemask_epi8((~x).v); } +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(expand_simd(a, internal::maskbits<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(expand_simd(a, internal::maskbits<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC bool bittestall(const u32neon& a) +{ + 
const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; +} + +KFR_INTRINSIC bool bittestany(const u32neon& a) +{ + const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; +} +KFR_INTRINSIC bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); } + +KFR_INTRINSIC bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } +KFR_INTRINSIC bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(expand_simd(a, internal::maskbits<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 
vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const vec<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(expand_simd(a, internal::maskbits<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const vec<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#else + +template <typename T, size_t N> +KFR_INTRINSIC bitmask<N> getmask(const vec<T, N>& x) +{ + typename bitmask<N>::type val = 0; + for (size_t i = 0; i < N; i++) + { + val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i; + } + return val; +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const vec<T, N>& x) +{ + return getmask(x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const vec<T, N>& x, const vec<T, N>& y) +{ + return bittestany(x & y); +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const vec<T, N>& x) +{ + return !getmask(~x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const vec<T, N>& x, const vec<T, N>& y) +{ + return !bittestany(~x & y); +} +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/min_max.hpp b/include/kfr/math/impl/min_max.hpp @@ -0,0 +1,236 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "../../math/abs.hpp"
#include "../../math/select.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// Per-lane min/max. Each type/width combination maps to the matching native
// intrinsic where the target ISA provides one; otherwise a select() based
// fallback is used.

#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS

// SSE2 provides float/double min/max plus only epu8 and epi16 integer forms.
KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); }
KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); }
KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); }
KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); }

KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); }
KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); }
KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); }
KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); }

#if defined CMT_ARCH_AVX2
KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); }

KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); }

#endif

#if defined CMT_ARCH_AVX512
KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); }
KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); }
KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); }
KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); }

KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); }
KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); }
KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); }
KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); }
KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); }
KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); }
KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); }

// NOTE(review): the 128/256-bit epi64/epu64 min/max intrinsics below are
// AVX-512VL encodings — confirm CMT_ARCH_AVX512 implies VL support here.
KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); }
KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); }

KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); }
KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); }
KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); }
KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); }
#else
// No native 64-bit integer min/max before AVX-512: compare and select.
KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
#endif

#if defined CMT_ARCH_AVX
KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); }
KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); }
KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); }
KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); }
#endif

#if defined CMT_ARCH_SSE41
// SSE4.1 adds the remaining 8/16/32-bit signed/unsigned forms.
KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); }
KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); }
KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); }
KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); }

KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); }
KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); }
KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); }
KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); }
#else
KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y,
x, y); } + +KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } + +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } + +KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return 
select(x > y, x, y); } + +KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); } +KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); } +#else +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#else + +// fallback +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x < y, x, y); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x > y, x, y); +} +#endif + +template <typename T> +KFR_INTRINSIC T min(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T max(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::min(); +} +template <typename T> +KFR_INTRINSIC T absmin(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? 
std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T absmax(initialvalue<T>) +{ + return 0; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) +{ + return min(abs(x), abs(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) +{ + return max(abs(x), abs(y)); +} + +KFR_HANDLE_SCALAR(min) +KFR_HANDLE_SCALAR(max) +KFR_HANDLE_SCALAR(absmin) +KFR_HANDLE_SCALAR(absmax) +} // namespace intrinsics +KFR_I_FN(min) +KFR_I_FN(max) +KFR_I_FN(absmin) +KFR_I_FN(absmax) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/modzerobessel.hpp b/include/kfr/math/impl/modzerobessel.hpp @@ -0,0 +1,104 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../math/log_exp.hpp" +#include "../../simd/impl/function.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wc99-extensions") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> modzerobessel(const vec<T, N>& x) +{ + constexpr static T bessel_coef[] = { T(0.25), + T(0.027777777777777776236), + T(0.0017361111111111110147), + T(6.9444444444444444384e-005), + T(1.9290123456790123911e-006), + T(3.9367598891408417495e-008), + T(6.1511873267825652335e-010), + T(7.5940584281266239246e-012), + T(7.5940584281266233693e-014), + T(6.2760813455591932909e-016), + T(4.3583898233049949985e-018), + T(2.5789288895295827557e-020), + T(1.3157800456783586208e-022), + T(5.8479113141260384983e-025), + T(2.2843403570804837884e-027), + T(7.904291893012054025e-030), + T(2.4395962632753252792e-032), + T(6.75788438580422547e-035), + T(1.689471096451056426e-037), + T(3.8310002187098784929e-040), + T(7.9152897080782616517e-043), + T(1.4962740468957016443e-045), + T(2.5976979980828152196e-048), + T(4.1563167969325041577e-051), + T(6.1483976285983795968e-054), + T(8.434015951438105991e-057), + T(1.0757673407446563809e-059), + T(1.2791526049282476926e-062), + T(1.4212806721424974034e-065), + T(1.4789601166935457918e-068), + T(1.4442969889585408123e-071), + T(1.3262598613026086927e-074), + T(1.1472836170437790782e-077), + T(9.3655805472961564331e-081), + T(7.2265282000741942594e-084), + T(5.2786911614858977913e-087), + T(3.6556032974279072401e-090), + T(2.4034209713529963119e-093), + T(1.5021381070956226783e-096) }; + + const vec<T, N> x_2 = x * 0.5; + const vec<T, N> x_2_sqr = x_2 * x_2; + vec<T, N> num = x_2_sqr; + vec<T, N> result; + result = 1 + x_2_sqr; + + CMT_LOOP_UNROLL + for (size_t i = 0; i < (sizeof(T) == 4 ? 
20 : 39); i++) + { + result = fmadd((num *= x_2_sqr), bessel_coef[i], result); + } + return result; +} + +KFR_HANDLE_SCALAR(modzerobessel) +} // namespace intrinsics +KFR_I_FN(modzerobessel) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/math/impl/round.hpp b/include/kfr/math/impl/round.hpp @@ -0,0 +1,282 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" +#include "abs.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ss(V) \ + _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_sd(V) \ + _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) +#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) + +#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); } +KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); } +KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); } 
+KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); } +KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); } +KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); } +KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); } +KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); } +KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); } +KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); } + +#if defined CMT_ARCH_AVX + +KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); } +KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); } +KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); } +KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); } +KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); } +KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); } +KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); } +KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); } +KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); } +KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); } + +#endif + +#if defined CMT_ARCH_AVX512 + +KFR_INTRINSIC f32avx512 floor(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 ceil(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 trunc(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 round(const f32avx512& value) +{ + return 
_mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 floor(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 ceil(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 trunc(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 round(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); } +KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>::value) +KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>::value) + +#else + +// fallback + +template <typename T> +constexpr T fp_precision_limit = 4503599627370496.0; +template <> +constexpr f32 fp_precision_limit<f32> = 16777216.0f; + +template <size_t N> +KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x) +{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x) 
+{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, + innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, + innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x) +{ + return x - floor(x); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x) +{ + return x - floor(x); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&) +{ + return T(0); +} + +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value) +{ + return 
innercast<IT>(floor(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value) +{ + return innercast<IT>(ceil(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value) +{ + return innercast<IT>(trunc(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value) +{ + return innercast<IT>(round(value)); +} + +KFR_HANDLE_SCALAR(floor) +KFR_HANDLE_SCALAR(ceil) +KFR_HANDLE_SCALAR(round) +KFR_HANDLE_SCALAR(trunc) +KFR_HANDLE_SCALAR(fract) +KFR_HANDLE_SCALAR(ifloor) +KFR_HANDLE_SCALAR(iceil) +KFR_HANDLE_SCALAR(iround) +KFR_HANDLE_SCALAR(itrunc) +} // namespace intrinsics +KFR_I_FN(floor) +KFR_I_FN(ceil) +KFR_I_FN(round) +KFR_I_FN(trunc) +KFR_I_FN(fract) +KFR_I_FN(ifloor) +KFR_I_FN(iceil) +KFR_I_FN(iround) +KFR_I_FN(itrunc) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#undef KFR_mm_trunc_ps +#undef KFR_mm_roundnearest_ps +#undef KFR_mm_trunc_pd +#undef KFR_mm_roundnearest_pd +#undef KFR_mm_trunc_ss +#undef KFR_mm_roundnearest_ss +#undef KFR_mm_trunc_sd +#undef KFR_mm_roundnearest_sd +#undef KFR_mm_floor_ss +#undef KFR_mm_floor_sd +#undef KFR_mm_ceil_ss +#undef KFR_mm_ceil_sd +#undef KFR_mm256_trunc_ps +#undef KFR_mm256_roundnearest_ps +#undef KFR_mm256_trunc_pd +#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/math/impl/saturation.hpp b/include/kfr/math/impl/saturation.hpp @@ -0,0 +1,205 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../math/select.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +// Generic functions +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) +{ + using UT = utype<T>; + constexpr size_t shift = typebits<UT>::bits - 1; + vec<UT, N> aa = bitcast<UT>(a); + vec<UT, N> bb = bitcast<UT>(b); + const vec<UT, N> sum = aa + bb; + aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); + + return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), a, bitcast<T>(sum)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b) +{ + using UT = utype<T>; + constexpr size_t shift = typebits<UT>::bits - 1; + vec<UT, N> aa = bitcast<UT>(a); + vec<UT, N> bb = bitcast<UT>(b); + const vec<UT, N> diff = aa - bb; + aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); + + return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), a, bitcast<T>(diff)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b) +{ + const vec<T, N> t = allonesvector(a); + return select(a > t - b, t, a + b); +} +template <typename T, size_t N> 
+KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b) +{ + return select(a < b, zerovector(a), a - b); +} + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); } + +KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); } +KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); } + +KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } +KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx satadd(const i8avx& x, const 
i8avx& y) { return _mm256_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); } + +KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } +KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } + +KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } +KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); } +KFR_INTRINSIC u8avx512 
satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); } +KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satadd(const 
i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); } + +KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); } +KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); } + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#else +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_sub(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif +KFR_HANDLE_SCALAR(satadd) +KFR_HANDLE_SCALAR(satsub) +} // namespace intrinsics 
+KFR_I_FN(satadd) +KFR_I_FN(satsub) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp @@ -0,0 +1,329 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8sse select(const u8sse& m, const u8sse& x, const u8sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u16sse select(const u16sse& m, const u16sse& x, const u16sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u32sse select(const u32sse& m, const u32sse& x, const u32sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u64sse select(const u64sse& m, const u64sse& x, const u64sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i8sse select(const i8sse& m, const i8sse& x, const i8sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i16sse select(const i16sse& m, const i16sse& x, const i16sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i32sse select(const i32sse& m, const i32sse& x, const i32sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i64sse select(const i64sse& m, const i64sse& x, const i64sse& y) +{ + return _mm_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC f32sse select(const f32sse& m, const f32sse& x, const f32sse& y) +{ + return _mm_blendv_ps(y.v, x.v, m.v); +} +KFR_INTRINSIC f64sse select(const f64sse& m, const f64sse& x, const f64sse& y) +{ + return _mm_blendv_pd(y.v, x.v, m.v); +} + +#if defined CMT_ARCH_AVX +KFR_INTRINSIC f64avx select(const f64avx& m, const f64avx& x, const f64avx& y) +{ + return _mm256_blendv_pd(y.v, x.v, m.v); +} +KFR_INTRINSIC f32avx select(const f32avx& m, const f32avx& x, const f32avx& y) +{ + return _mm256_blendv_ps(y.v, x.v, m.v); +} +#endif + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC u8avx select(const u8avx& m, const u8avx& x, const u8avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u16avx select(const u16avx& m, 
const u16avx& x, const u16avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u32avx select(const u32avx& m, const u32avx& x, const u32avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC u64avx select(const u64avx& m, const u64avx& x, const u64avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i8avx select(const i8avx& m, const i8avx& x, const i8avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i16avx select(const i16avx& m, const i16avx& x, const i16avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i32avx select(const i32avx& m, const i32avx& x, const i32avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +KFR_INTRINSIC i64avx select(const i64avx& m, const i64avx& x, const i64avx& y) +{ + return _mm256_blendv_epi8(y.v, x.v, m.v); +} +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f64avx512 select(const f64avx512& m, const f64avx512& x, const f64avx512& y) +{ + return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v); +} +KFR_INTRINSIC f32avx512 select(const f32avx512& m, const f32avx512& x, const f32avx512& y) +{ + return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v); +} +KFR_INTRINSIC u8avx512 select(const u8avx512& m, const u8avx512& x, const u8avx512& y) +{ + return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC u16avx512 select(const u16avx512& m, const u16avx512& x, const u16avx512& y) +{ + return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC u32avx512 select(const u32avx512& m, const u32avx512& x, const u32avx512& y) +{ + return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC u64avx512 select(const u64avx512& m, const u64avx512& x, const u64avx512& y) +{ + return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i8avx512 select(const i8avx512& m, const 
i8avx512& x, const i8avx512& y) +{ + return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i16avx512 select(const i16avx512& m, const i16avx512& x, const i16avx512& y) +{ + return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i32avx512 select(const i32avx512& m, const i32avx512& x, const i32avx512& y) +{ + return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); +} +KFR_INTRINSIC i64avx512 select(const i64avx512& m, const i64avx512& x, const i64avx512& y) +{ + return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) + .shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + vec<T, N> r; + intrin(r, a, b, c, [](auto x, auto y, auto z) { return intrinsics::select(x, y, z); }); + return r; + // return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); + // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c) +{ + return concat2(select(a.h.low, b, c), 
select(a.h.high, b, c)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c) +{ + return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>)) + .shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c) +{ + return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); +} + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32neon select(const f32neon& m, const f32neon& x, const f32neon& y) +{ + return vbslq_f32(m.v, x.v, y.v); +} +KFR_INTRINSIC i8neon select(const i8neon& m, const i8neon& x, const i8neon& y) +{ + return vbslq_s8(m.v, x.v, y.v); +} +KFR_INTRINSIC u8neon select(const u8neon& m, const u8neon& x, const u8neon& y) +{ + return vbslq_u8(m.v, x.v, y.v); +} +KFR_INTRINSIC i16neon select(const i16neon& m, const i16neon& x, const i16neon& y) +{ + return vbslq_s16(m.v, x.v, y.v); +} +KFR_INTRINSIC u16neon select(const u16neon& m, const u16neon& x, const u16neon& y) +{ + return vbslq_u16(m.v, x.v, y.v); +} +KFR_INTRINSIC i32neon select(const i32neon& m, const i32neon& x, const i32neon& y) +{ + 
return vbslq_s32(m.v, x.v, y.v); +} +KFR_INTRINSIC u32neon select(const u32neon& m, const u32neon& x, const u32neon& y) +{ + return vbslq_u32(m.v, x.v, y.v); +} +KFR_INTRINSIC i64neon select(const i64neon& m, const i64neon& x, const i64neon& y) +{ + return vbslq_s64(m.v, x.v, y.v); +} +KFR_INTRINSIC u64neon select(const u64neon& m, const u64neon& x, const u64neon& y) +{ + return vbslq_u64(m.v, x.v, y.v); +} + +#ifdef CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y) +{ + return vbslq_f64(m.v, x.v, y.v); +} +#else +KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y) +{ + return y ^ ((x ^ y) & m); +} +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + constexpr size_t Nout = next_simd_width<T>(N); + return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) + .shuffle(csizeseq<N>); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) +{ + return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y) +{ + return select(m, vec<T, N>(x), vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y) +{ + return select(m, x, vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y) +{ + return select(m, vec<T, N>(x), y); +} + +#else + +// fallback +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) +{ + return y ^ ((x ^ y) & m); +} 
+template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y) +{ + return select(m, vec<T, N>(x), vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y) +{ + return select(m, x, vec<T, N>(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y) +{ + return select(m, vec<T, N>(x), y); +} +#endif + +} // namespace intrinsics +KFR_I_FN(select) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/sin_cos.hpp b/include/kfr/math/impl/sin_cos.hpp @@ -0,0 +1,310 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../math/min_max.hpp" +#include "../../math/round.hpp" +#include "../../math/select.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" +#include "../../simd/shuffle.hpp" + +#if CMT_HAS_WARNING("-Wc99-extensions") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0) +{ + return select(msk, a0, b0); +} + +template <typename T, size_t N, typename... Ts> +KFR_INTRINSIC vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0, + const T& a1, const T& b1, const Ts&... values) +{ + return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) +{ + const vec<T, N> xabs = abs(x); + constexpr T div = constants<T>::fold_constant_div; + vec<T, N> y = floor(xabs / div); + quadrant = innercast<itype<T>>(innercast<int>(y - floor(y * T(1.0 / 16.0)) * T(16.0))); + + const mask<T, N> msk = (quadrant & 1) != 0; + quadrant = kfr::select(msk, quadrant + 1, quadrant); + y = select(msk, y + T(1.0), y); + quadrant = quadrant & 7; + + constexpr T hi = constants<T>::fold_constant_hi; + constexpr T rem1 = constants<T>::fold_constant_rem1; + constexpr T rem2 = constants<T>::fold_constant_rem2; + return (xabs - y * hi) - y * rem1 - y * rem2; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> fold_range(const vec<T, N>& x) +{ + vec<itype<T>, N> q; + return trig_fold(x, q); +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask) +{ + constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, 
-1.6666667163e-01f); + constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f); + constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f); + constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f); + constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f); + constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f); + constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f); + constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f); + constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f); + constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f); + + const vec<f32, N> x2 = folded * folded; + + vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, + cos_c8, sin_c8, cos_c10, sin_c10); + + formula = select(cosmask, formula, formula * folded); + return formula; +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask) +{ + constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01); + constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03); + constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04); + constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06); + constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08); + constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10); + constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13); + constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14); + constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01); + constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02); + constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03); + constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 
2.480158730125666056e-05); + constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07); + constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09); + constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11); + constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14); + + vec<f64, N> x2 = folded * folded; + vec<f64, N> formula = + trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8, + cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16); + + formula = select(cosmask, formula, formula * folded); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> +KFR_INTRINSIC vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask) +{ + vec<itype<T>, N> quadrant; + vec<T, N> folded = trig_fold(x_full, quadrant); + + mask<T, N> flip_sign = + kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask(); + + mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); + usecos = usecos ^ cosmask; + + vec<T, N> formula = trig_sincos(folded, usecos); + + mask<T, N> negmask = x_full < T(0); + + flip_sign = flip_sign ^ (negmask & ~cosmask); + + formula = select(flip_sign, -formula, formula); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sin(const vec<T, N>& x) +{ + vec<itype<T>, N> quadrant; + vec<T, N> folded = trig_fold(x, quadrant); + + mask<T, N> flip_sign = quadrant >= itype<T>(4); + mask<T, N> usecos = (quadrant == itype<T>(2)) || (quadrant == itype<T>(6)); + + vec<T, N> formula = trig_sincos(folded, usecos); + + formula = select(flip_sign ^ mask<T, N>(x), -formula, formula); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> cos(const vec<T, N>& x) +{ + vec<itype<T>, N> quadrant; + vec<T, N> folded = trig_fold(x, 
quadrant); + + mask<T, N> eq4 = (quadrant == 4); + mask<T, N> flip_sign = (quadrant == 2) || eq4; + mask<T, N> usecos = (quadrant == 0) || eq4; + + vec<T, N> formula = trig_sincos(folded, usecos); + + formula = select(flip_sign, -formula, formula); + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> fastsin(const vec<T, N>& x) +{ + const vec<T, N> msk = broadcast<N>(special_constants<T>::highbitmask()); + + constexpr static T c2 = -0.16665853559970855712890625; + constexpr static T c4 = +8.31427983939647674560546875e-3; + constexpr static T c6 = -1.85423981747590005397796630859375e-4; + + const vec<T, N> pi = c_pi<T>; + + vec<T, N> xx = x - pi; + vec<T, N> y = abs(xx); + y = select(y > c_pi<T, 1, 2>, pi - y, y); + y = y ^ (msk & ~xx); + + vec<T, N> y2 = y * y; + vec<T, N> formula = c6; + vec<T, N> y3 = y2 * y; + formula = fmadd(formula, y2, c4); + formula = fmadd(formula, y2, c2); + formula = formula * y3 + y; + return formula; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> fastcos(const vec<T, N>& x) +{ + x += c_pi<T, 1, 2>; + x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x); + return fastsin(x); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sincos(const vec<T, N>& x) +{ + return sincos_mask(x, internal::oddmask<T, N>()); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> cossin(const vec<T, N>& x) +{ + return sincos_mask(x, internal::evenmask<T, N>()); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sinc(const vec<T, N>& x) +{ + return select(abs(x) <= avoid_odr_use(constants<T>::epsilon), T(1), sin(x) / x); +} + +KFR_HANDLE_SCALAR_1_T(sin, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(cos, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(fastsin, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(fastcos, 
flt_type<T>) +KFR_HANDLE_SCALAR_1_T(sincos, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(cossin, flt_type<T>) +KFR_HANDLE_SCALAR_1_T(sinc, flt_type<T>) + +KFR_HANDLE_NOT_F_1(sin) +KFR_HANDLE_NOT_F_1(cos) +KFR_HANDLE_NOT_F_1(fastsin) +KFR_HANDLE_NOT_F_1(fastcos) +KFR_HANDLE_NOT_F_1(sincos) +KFR_HANDLE_NOT_F_1(cossin) +KFR_HANDLE_NOT_F_1(sinc) + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout sindeg(const T& x) +{ + return sin(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout cosdeg(const T& x) +{ + return cos(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout fastsindeg(const T& x) +{ + return fastsin(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout fastcosdeg(const T& x) +{ + return fastcos(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout sincosdeg(const T& x) +{ + return sincos(x * avoid_odr_use(constants<Tout>::degtorad)); +} + +template <typename T, typename Tout = flt_type<T>> +KFR_INTRINSIC Tout cossindeg(const T& x) +{ + return cossin(x * avoid_odr_use(constants<Tout>::degtorad)); +} +} // namespace intrinsics + +KFR_I_FN(sin) +KFR_I_FN(cos) +KFR_I_FN(fastsin) +KFR_I_FN(fastcos) +KFR_I_FN(sincos) +KFR_I_FN(cossin) + +KFR_I_FN(sindeg) +KFR_I_FN(cosdeg) +KFR_I_FN(fastsindeg) +KFR_I_FN(fastcosdeg) +KFR_I_FN(sincosdeg) +KFR_I_FN(cossindeg) + +KFR_I_FN(sinc) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/sqrt.hpp b/include/kfr/math/impl/sqrt.hpp @@ -0,0 +1,72 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either 
version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../../simd/impl/function.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32x1 sqrt(const f32x1& x) { return slice<0, 1>(f32x4(_mm_sqrt_ss(extend<4>(x).v))); } +KFR_INTRINSIC f64x1 sqrt(const f64x1& x) +{ + return slice<0, 1>(f64x2(_mm_sqrt_sd(_mm_setzero_pd(), extend<2>(x).v))); +} +KFR_INTRINSIC f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(x.v); } +KFR_INTRINSIC f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(x.v); } + +#if defined CMT_ARCH_AVX +KFR_INTRINSIC f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(x.v); } +KFR_INTRINSIC f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(x.v); } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f32avx512 sqrt(const f32avx512& x) { return _mm512_sqrt_ps(x.v); } +KFR_INTRINSIC f64avx512 sqrt(const f64avx512& x) { return _mm512_sqrt_pd(x.v); } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(sqrt, is_f_class<T>::value) + +#else + +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> sqrt(const vec<T, N>& x) +{ + return apply([](T x) { return std::sqrt(x); }, x); +} +#endif +KFR_HANDLE_SCALAR_1_T(sqrt, flt_type<T>) + +KFR_HANDLE_NOT_F_1(sqrt) +} // 
namespace intrinsics +KFR_I_FN(sqrt) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/impl/tan.hpp b/include/kfr/math/impl/tan.hpp @@ -0,0 +1,149 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../../math/abs.hpp" +#include "../../math/select.hpp" +#include "../../math/sin_cos.hpp" +#include "../../simd/constants.hpp" +#include "../../simd/impl/function.hpp" +#include "../../simd/operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse) +{ + constexpr T pi_14 = c_pi<T, 1, 4>; + + vec<T, N> y = abs(x_full); + vec<T, N> scaled = y / pi_14; + + vec<T, N> k_real = floor(scaled); + vec<IT, N> k = innercast<IT>(k_real); + + vec<T, N> x = y - k_real * pi_14; + + mask<T, N> need_offset = (k & 1) != 0; + x = select(need_offset, x - pi_14, x); + + vec<IT, N> k_mod4 = k & 3; + inverse = (k_mod4 == 1) || (k_mod4 == 2); + return x; +} + +template <size_t N> +KFR_INTRINSIC vec<f32, N> tan(const vec<f32, N>& x_full) +{ + mask<f32, N> inverse; + vec<i32, N> quad; + const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); + inverse = quad == 2 || quad == 6; + + constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01); + constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01); + constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02); + constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02); + constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03); + constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03); + + constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01); + constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02); + constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03); + constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04); + constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05); + constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, 
-4.153305894760705996e-07); + + const vec<f32, N> x2 = x * x; + const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, + tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12); + + const vec<f32, N> z = select(inverse, val / -x, val * x); + return mulsign(z, x_full); +} + +template <size_t N> +KFR_INTRINSIC vec<f64, N> tan(const vec<f64, N>& x_full) +{ + mask<f64, N> inverse; + vec<i64, N> quad; + const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); + inverse = quad == 2 || quad == 6; + + constexpr f64 tan_c2 = 0x1.5555555555a3cp-2; + constexpr f64 tan_c4 = 0x1.11111110c4068p-3; + constexpr f64 tan_c6 = 0x1.ba1ba1ef36a4dp-5; + constexpr f64 tan_c8 = 0x1.664f3f4af7ce2p-6; + constexpr f64 tan_c10 = 0x1.226f2682a2616p-7; + constexpr f64 tan_c12 = 0x1.d6b440e73f61dp-9; + constexpr f64 tan_c14 = 0x1.7f06cdd30bd39p-10; + constexpr f64 tan_c16 = 0x1.2a8fab895738ep-11; + constexpr f64 tan_c18 = 0x1.34ff88cfdc292p-12; + constexpr f64 tan_c20 = -0x1.b4165ea04339fp-18; + constexpr f64 tan_c22 = 0x1.5f93701d86962p-13; + constexpr f64 tan_c24 = -0x1.5a13a3cdfb8c1p-14; + constexpr f64 tan_c26 = 0x1.77c69cef3306cp-15; + + constexpr f64 cot_c2 = -0x1.5555555555555p-2; + constexpr f64 cot_c4 = -0x1.6c16c16c16dcdp-6; + constexpr f64 cot_c6 = -0x1.1566abbff68a7p-9; + constexpr f64 cot_c8 = -0x1.bbd7794ef9999p-13; + constexpr f64 cot_c10 = -0x1.66a8ea1991906p-16; + constexpr f64 cot_c12 = -0x1.228220068711cp-19; + constexpr f64 cot_c14 = -0x1.d65ed2c45e21dp-23; + constexpr f64 cot_c16 = -0x1.897ead4a2f71dp-26; + constexpr f64 cot_c18 = -0x1.b592dc8656ec9p-31; + constexpr f64 cot_c20 = -0x1.3dc07078c46d6p-29; + constexpr f64 cot_c22 = 0x1.06c9e5c370edcp-29; + constexpr f64 cot_c24 = -0x1.217f50c9dbca3p-30; + constexpr f64 cot_c26 = 0x1.163ed8171a0c8p-32; + + const vec<f64, N> x2 = x * x; + const vec<f64, N> val = + trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, cot_c8, 
tan_c8, + cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14, cot_c16, tan_c16, cot_c18, tan_c18, + cot_c20, tan_c20, cot_c22, tan_c22, cot_c24, tan_c24, cot_c26, tan_c26); + + const vec<f64, N> z = select(inverse, val / -x, val * x); + return mulsign(z, x_full); +} + +KFR_HANDLE_SCALAR_1_T(tan, flt_type<T>) +KFR_HANDLE_NOT_F_1(tan) + +template <typename T> +KFR_INTRINSIC flt_type<T> tandeg(const T& x) +{ + return tan(x * c_degtorad<flt_type<T>>); +} +} // namespace intrinsics +KFR_I_FN(tan) +KFR_I_FN(tandeg) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/interpolation.hpp b/include/kfr/math/interpolation.hpp @@ -0,0 +1,74 @@ +/** @addtogroup interpolation + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "select.hpp" +#include "sin_cos.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T, typename M> +KFR_FUNCTION T nearest(M mu, T x1, T x2) +{ + return select(mu < M(0.5), x1, x2); +} + +template <typename T, typename M> +KFR_FUNCTION T linear(M mu, T x1, T x2) +{ + return mix(mu, x1, x2); +} + +template <typename T, typename M> +KFR_FUNCTION T cosine(M mu, T x1, T x2) +{ + return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2); +} + +template <typename T, typename M> +KFR_FUNCTION T cubic(M mu, T x0, T x1, T x2, T x3) +{ + const T a0 = x3 - x2 - x0 + x1; + const T a1 = x0 - x1 - a0; + const T a2 = x2 - x0; + const T a3 = x1; + return horner(mu, a0, a1, a2, a3); +} + +template <typename T, typename M> +KFR_FUNCTION T catmullrom(M mu, T x0, T x1, T x2, T x3) +{ + const T a0 = T(0.5) * (x3 - x0) - T(1.5) * (x2 - x1); + const T a1 = x0 - T(2.5) * x1 + T(2) * x2 - T(0.5) * x3; + const T a2 = T(0.5) * (x2 - x0); + const T a3 = x1; + return horner(mu, a0, a1, a2, a3); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/log_exp.hpp b/include/kfr/math/log_exp.hpp @@ -0,0 +1,232 @@ +/** @addtogroup exponential + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/log_exp.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns e raised to the given power x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> exp(const T1& x) +{ + return intrinsics::exp(x); +} + +/// @brief Returns e raised to the given power x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::exp, E1> exp(E1&& x) +{ + return { fn::exp(), std::forward<E1>(x) }; +} + +/// @brief Returns 2 raised to the given power x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> exp2(const T1& x) +{ + return intrinsics::exp2(x); +} + +/// @brief Returns 2 raised to the given power x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::exp2, E1> exp2(E1&& x) +{ + return { fn::exp2(), std::forward<E1>(x) }; +} + +/// @brief Returns 10 raised to the given power x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> exp10(const T1& x) +{ + return intrinsics::exp10(x); +} + +/// @brief Returns 10 raised to the given power x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::exp10, E1> exp10(E1&& x) +{ + return { fn::exp10(), std::forward<E1>(x) }; +} + +/// @brief Returns the natural logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> log(const T1& x) +{ + return intrinsics::log(x); +} + +/// @brief Returns the natural logarithm of the x. 
Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::log, E1> log(E1&& x) +{ + return { fn::log(), std::forward<E1>(x) }; +} + +/// @brief Returns the binary (base-2) logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> log2(const T1& x) +{ + return intrinsics::log2(x); +} + +/// @brief Returns the binary (base-2) logarithm of the x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::log2, E1> log2(E1&& x) +{ + return { fn::log2(), std::forward<E1>(x) }; +} + +/// @brief Returns the common (base-10) logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> log10(const T1& x) +{ + return intrinsics::log10(x); +} + +/// @brief Returns the common (base-10) logarithm of the x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::log10, E1> log10(E1&& x) +{ + return { fn::log10(), std::forward<E1>(x) }; +} + +/// @brief Returns the rounded binary (base-2) logarithm of the x. +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> logb(const T1& x) +{ + return intrinsics::logb(x); +} + +/// @brief Returns the rounded binary (base-2) logarithm of the x. Version that accepts and returns +/// expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::logb, E1> logb(E1&& x) +{ + return { fn::logb(), std::forward<E1>(x) }; +} + +/// @brief Returns the logarithm of the x with base y. 
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> logn(const T1& x, const T2& y) +{ + return intrinsics::logn(x, y); +} + +/// @brief Returns the logarithm of the x with base y. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::logn, E1, E2> logn(E1&& x, E2&& y) +{ + return { fn::logn(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns log(x) * y. +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> logm(const T1& x, const T2& y) +{ + return intrinsics::logm(x, y); +} + +/// @brief Returns log(x) * y. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::logm, E1, E2> logm(E1&& x, E2&& y) +{ + return { fn::logm(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns exp(x * m + a). +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& y, const T3& z) +{ + return intrinsics::exp_fmadd(x, y, z); +} + +/// @brief Returns exp(x * m + a). Accepts and returns expressions. +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_FUNCTION internal::expression_function<fn::exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z) +{ + return { fn::exp_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; +} + +/// @brief Returns log(x) * m + a. 
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& y, const T3& z) +{ + return intrinsics::log_fmadd(x, y, z); +} + +/// @brief Returns log(x) * m + a. Accepts and returns expressions. +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_FUNCTION internal::expression_function<fn::log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z) +{ + return { fn::log_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) }; +} + +/// @brief Returns the x raised to the given power y. +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> pow(const T1& x, const T2& y) +{ + return intrinsics::pow(x, y); +} + +/// @brief Returns the x raised to the given power y. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::pow, E1, E2> pow(E1&& x, E2&& y) +{ + return { fn::pow(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns the real nth root of the x. +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)> +KFR_FUNCTION flt_type<common_type<T1, T2>> root(const T1& x, const T2& y) +{ + return intrinsics::root(x, y); +} + +/// @brief Returns the real nth root of the x. Accepts and returns expressions. +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_FUNCTION internal::expression_function<fn::root, E1, E2> root(E1&& x, E2&& y) +{ + return { fn::root(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Returns the cube root of the x. 
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cbrt(const T1& x) +{ + return intrinsics::cbrt(x); +} + +/// @brief Returns the cube root of the x. Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cbrt, E1> cbrt(E1&& x) +{ + return { fn::cbrt(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/logical.hpp b/include/kfr/math/logical.hpp @@ -0,0 +1,54 @@ +/** @addtogroup logical + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/logical.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns x[0] && x[1] && ... && x[N-1] + */ +template <typename T, size_t N> +KFR_INTRINSIC bool all(const mask<T, N>& x) +{ + return intrinsics::bittestall(x.asvec()); +} + +/** + * @brief Returns x[0] || x[1] || ... 
|| x[N-1] + */ +template <typename T, size_t N> +KFR_INTRINSIC bool any(const mask<T, N>& x) +{ + return intrinsics::bittestany(x.asvec()); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/min_max.hpp b/include/kfr/math/min_max.hpp @@ -0,0 +1,111 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/min_max.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the smaller of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout min(const T1& x, const T2& y) +{ + return intrinsics::min(x, y); +} + +/** + * @brief Returns the smaller of two values. Accepts and returns expressions. 
+ */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::min, E1, E2> min(E1&& x, E2&& y) +{ + return { fn::min(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the greater of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout max(const T1& x, const T2& y) +{ + return intrinsics::max(x, y); +} + +/** + * @brief Returns the greater of two values. Accepts and returns expressions. + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::max, E1, E2> max(E1&& x, E2&& y) +{ + return { fn::max(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the smaller in magnitude of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout absmin(const T1& x, const T2& y) +{ + return intrinsics::absmin(x, y); +} + +/** + * @brief Returns the smaller in magnitude of two values. Accepts and returns expressions. + */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::absmin, E1, E2> absmin(E1&& x, E2&& y) +{ + return { fn::absmin(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/** + * @brief Returns the greater in magnitude of two values. + */ +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout absmax(const T1& x, const T2& y) +{ + return intrinsics::absmax(x, y); +} + +/** + * @brief Returns the greater in magnitude of two values. Accepts and returns expressions. 
+ */ +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::absmax, E1, E2> absmax(E1&& x, E2&& y) +{ + return { fn::absmax(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/modzerobessel.hpp b/include/kfr/math/modzerobessel.hpp @@ -0,0 +1,47 @@ +/** @addtogroup other_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "impl/modzerobessel.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION T1 modzerobessel(const T1& x) +{ + return intrinsics::modzerobessel(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1&& x) +{ + return { fn::modzerobessel(), std::forward<E1>(x) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/round.hpp b/include/kfr/math/round.hpp @@ -0,0 +1,163 @@ +/** @addtogroup round + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/round.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Returns the largest integer value not greater than x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 floor(const T1& x) +{ + return intrinsics::floor(x); +} + +/// @brief Returns the largest integer value not greater than x. 
Accepts and returns expressions. +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::floor, E1> floor(E1&& x) +{ + return { fn::floor(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 ceil(const T1& x) +{ + return intrinsics::ceil(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::ceil, E1> ceil(E1&& x) +{ + return { fn::ceil(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 round(const T1& x) +{ + return intrinsics::round(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::round, E1> round(E1&& x) +{ + return { fn::round(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 trunc(const T1& x) +{ + return intrinsics::trunc(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::trunc, E1> trunc(E1&& x) +{ + return { fn::trunc(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 fract(const T1& x) +{ + return intrinsics::fract(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::fract, E1> fract(E1&& x) +{ + return { fn::fract(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC itype<T1> ifloor(const T1& x) +{ + return intrinsics::ifloor(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::ifloor, E1> ifloor(E1&& x) +{ + return { fn::ifloor(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> 
+KFR_INTRINSIC itype<T1> iceil(const T1& x) +{ + return intrinsics::iceil(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::iceil, E1> iceil(E1&& x) +{ + return { fn::iceil(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC itype<T1> iround(const T1& x) +{ + return intrinsics::iround(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::iround, E1> iround(E1&& x) +{ + return { fn::iround(), std::forward<E1>(x) }; +} + +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC itype<T1> itrunc(const T1& x) +{ + return intrinsics::itrunc(x); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::itrunc, E1> itrunc(E1&& x) +{ + return { fn::itrunc(), std::forward<E1>(x) }; +} + +template <typename T, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC T fmod(const T& x, const T& y) +{ + return x - trunc(x / y) * y; +} +KFR_FN(fmod) + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> +constexpr KFR_INTRINSIC vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) +{ + return x % y; +} +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) +{ + return fmod(x, y); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/saturation.hpp b/include/kfr/math/saturation.hpp @@ -0,0 +1,65 @@ +/** @addtogroup saturation + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/saturation.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Adds two arguments using saturation +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout satadd(const T1& x, const T2& y) +{ + return intrinsics::satadd(x, y); +} + +/// @brief Creates an expression that adds two arguments using saturation +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::satadd, E1, E2> satadd(E1&& x, E2&& y) +{ + return { fn::satadd(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +/// @brief Subtracts two arguments using saturation +template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value), + typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout satsub(const T1& x, const T2& y) +{ + return intrinsics::satsub(x, y); +} + +/// @brief Creates an expression that subtracts two arguments using saturation +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::satsub, E1, E2> satsub(E1&& x, E2&& y) +{ + return { fn::satsub(), std::forward<E1>(x), std::forward<E2>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git 
a/include/kfr/math/select.hpp b/include/kfr/math/select.hpp @@ -0,0 +1,59 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/select.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns x if m is true, otherwise return y. Order of the arguments is same as in ternary operator. + * @code + * return m ? x : y + * @endcode + */ +template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value), + typename Tout = subtype<common_type<T2, T3>>> +KFR_INTRINSIC vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y) +{ + static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types"); + return intrinsics::select(bitcast<Tout>(m.asvec()), innercast<Tout>(x), innercast<Tout>(y)); +} + +/** + * @brief Returns template expression that returns x if m is true, otherwise return y. Order of the arguments + * is same as in ternary operator. 
+ */ +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y) +{ + return { fn::select(), std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) }; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/sin_cos.hpp b/include/kfr/math/sin_cos.hpp @@ -0,0 +1,318 @@ +/** @addtogroup trigonometric + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/sin_cos.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Returns the trigonometric sine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sin(const T1& x) +{ + return intrinsics::sin(x); +} + +/** + * @brief Returns the trigonometric sine of x. Accepts and returns expressions. 
+ */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sin, E1> sin(E1&& x) +{ + return { fn::sin(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cos(const T1& x) +{ + return intrinsics::cos(x); +} + +/** + * @brief Returns the trigonometric cosine of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cos, E1> cos(E1&& x) +{ + return { fn::cos(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric sine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastsin(const T1& x) +{ + return intrinsics::fastsin(x); +} + +/** + * @brief Returns an approximation of the trigonometric sine of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastsin, E1> fastsin(E1&& x) +{ + return { fn::fastsin(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric cosine of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastcos(const T1& x) +{ + return intrinsics::fastcos(x); +} + +/** + * @brief Returns an approximation of the trigonometric cosine of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastcos, E1> fastcos(E1&& x) +{ + return { fn::fastcos(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must + * be a vector. 
+ */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sincos(const T1& x) +{ + return intrinsics::sincos(x); +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and + * cosine of the odd elements. x must be a vector. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sincos, E1> sincos(E1&& x) +{ + return { fn::sincos(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must + * be a vector. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cossin(const T1& x) +{ + return intrinsics::cossin(x); +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and + * sine of the odd elements. x must be a vector. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cossin, E1> cossin(E1&& x) +{ + return { fn::cossin(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the x (expressed in degrees). + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sindeg(const T1& x) +{ + return intrinsics::sindeg(x); +} + +/** + * @brief Returns the trigonometric sine of the x (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sindeg, E1> sindeg(E1&& x) +{ + return { fn::sindeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of the x (expressed in degrees). 
+ */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cosdeg(const T1& x) +{ + return intrinsics::cosdeg(x); +} + +/** + * @brief Returns the trigonometric cosine of the x (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cosdeg, E1> cosdeg(E1&& x) +{ + return { fn::cosdeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric sine of the x (expressed in degrees). + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastsindeg(const T1& x) +{ + return intrinsics::fastsindeg(x); +} + +/** + * @brief Returns an approximation of the trigonometric sine of the x + * (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastsindeg, E1> fastsindeg(E1&& x) +{ + return { fn::fastsindeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns an approximation of the trigonometric cosine of the x (expressed in degrees). + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> fastcosdeg(const T1& x) +{ + return intrinsics::fastcosdeg(x); +} + +/** + * @brief Returns an approximation of the trigonometric cosine of the x + * (expressed in degrees). Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::fastcosdeg, E1> fastcosdeg(E1&& x) +{ + return { fn::fastcosdeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must + * be a vector and expressed in degrees. 
+ */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sincosdeg(const T1& x) +{ + return intrinsics::sincosdeg(x); +} + +/** + * @brief Returns the trigonometric sine of the even elements of the x and + * cosine of the odd elements. x must be expressed in degrees. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sincosdeg, E1> sincosdeg(E1&& x) +{ + return { fn::sincosdeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must + * be a vector and expressed in degrees. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> cossindeg(const T1& x) +{ + return intrinsics::cossindeg(x); +} + +/** + * @brief Returns the trigonometric cosine of the even elements of the x and + * sine of the odd elements. x must be expressed in degrees. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::cossindeg, E1> cossindeg(E1&& x) +{ + return { fn::cossindeg(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the sinc function of x. + * \f[ + * sinc(x) = \frac{sin(x)}{x} + * \f] + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_FUNCTION flt_type<T1> sinc(const T1& x) +{ + return intrinsics::sinc(x); +} + +/** + * @brief Returns the sinc function of x. Accepts and returns expressions. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_FUNCTION internal::expression_function<fn::sinc, E1> sinc(E1&& x) +{ + return { fn::sinc(), std::forward<E1>(x) }; +} + +/** + * @brief Returns the trigonometric sine of the angle 2x using sin(x) and cos(x). 
+ */ +template <typename T> +KFR_INTRINSIC T sin2x(const T& sinx, const T& cosx) +{ + return 2 * sinx * cosx; +} + +/** + * @brief Returns the trigonometric sine of the angle 3x using already computed sin(x) and cos(x). + */ +template <typename T> +KFR_INTRINSIC T sin3x(const T& sinx, const T& cosx) +{ + return sinx * (-1 + 4 * sqr(cosx)); +} + +/** + * @brief Returns the trigonometric cosine of the angle 2x using already computed sin(x) and cos(x). + */ +template <typename T> +KFR_INTRINSIC T cos2x(const T& sinx, const T& cosx) +{ + return sqr(cosx) - sqr(sinx); +} + +/** + * @brief Returns the trigonometric cosine of the angle 3x using already computed sin(x) and cos(x). + */ +template <typename T> +KFR_INTRINSIC T cos3x(const T& sinx, const T& cosx) +{ + return cosx * (1 - 4 * sqr(sinx)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/sqrt.hpp b/include/kfr/math/sqrt.hpp @@ -0,0 +1,53 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#include "impl/sqrt.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

/**
 * @brief Returns the positive square root of the x. \f$\sqrt{x}\f$
 */
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
KFR_INTRINSIC flt_type<T1> sqrt(const T1& x)
{
    // Result is always a floating-point type (flt_type), even for integer arguments.
    return intrinsics::sqrt(x);
}

/**
 * @brief Returns template expression that returns the positive square root of the x. \f$\sqrt{x}\f$
 */
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_INTRINSIC internal::expression_function<fn::sqrt, E1> sqrt(E1&& x)
{
    // Builds a lazy expression node; evaluation happens when the expression is read.
    return { fn::sqrt(), std::forward<E1>(x) };
}
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/math/tan.hpp b/include/kfr/math/tan.hpp
@@ -0,0 +1,59 @@
/** @addtogroup trigonometric
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "impl/tan.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

/**
 * @brief Returns the trigonometric tangent of x.
 */
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
KFR_FUNCTION flt_type<T1> tan(const T1& x)
{
    return intrinsics::tan(x);
}

/**
 * @brief Returns the trigonometric tangent of x. Accepts and returns expressions.
 */
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_FUNCTION internal::expression_function<fn::tan, E1> tan(E1&& x)
{
    return { fn::tan(), std::forward<E1>(x) };
}

/**
 * @brief Returns the trigonometric tangent of x. x must be expressed in degrees.
 */
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
KFR_FUNCTION flt_type<T1> tandeg(const T1& x)
{
    return intrinsics::tandeg(x);
}

/**
 * @brief Returns the trigonometric tangent of x. x must be expressed in degrees.
 * Accepts and returns expressions.
 */
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_FUNCTION internal::expression_function<fn::tandeg, E1> tandeg(E1&& x)
{
    return { fn::tandeg(), std::forward<E1>(x) };
}
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/runtime.hpp b/include/kfr/runtime.hpp
@@ -0,0 +1,26 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
+ */ +#pragma once + +#include "runtime/cpuid.hpp" +#include "runtime/cpuid_auto.hpp" diff --git a/include/kfr/runtime/cpuid.hpp b/include/kfr/runtime/cpuid.hpp @@ -0,0 +1,300 @@ +/** @addtogroup cpuid + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include "../simd/platform.hpp"
#include "../simd/types.hpp"
#include <cstring>

namespace kfr
{
#ifdef CMT_ARCH_X86

// One-bit flags for every CPU capability probed by detect_cpu() below,
// plus the vendor/model strings returned by CPUID.
struct cpu_features
{
    u32 max;   // highest supported standard CPUID leaf (leaf 0, EAX)
    u32 exmax; // highest supported extended CPUID leaf (leaf 0x80000000, EAX)
    u32 isIntel : 1;
    u32 isAMD : 1;
    u32 has3DNOW : 1;
    u32 has3DNOWEXT : 1;
    u32 hasABM : 1;
    u32 hasADX : 1;
    u32 hasAES : 1;
    u32 hasAVX : 1;
    u32 hasAVX2 : 1;
    u32 hasAVXOSSUPPORT : 1;
    u32 hasAVX512OSSUPPORT : 1;
    u32 hasAVX512CD : 1;
    u32 hasAVX512ER : 1;
    u32 hasAVX512F : 1;
    u32 hasAVX512DQ : 1;
    u32 hasAVX512PF : 1;
    u32 hasAVX512BW : 1;
    u32 hasAVX512VL : 1;
    u32 hasBMI1 : 1;
    u32 hasBMI2 : 1;
    u32 hasCLFSH : 1;
    u32 hasCMOV : 1;
    u32 hasCMPXCHG16B : 1;
    u32 hasCX8 : 1;
    u32 hasERMS : 1;
    u32 hasF16C : 1;
    u32 hasFMA : 1;
    u32 hasFSGSBASE : 1;
    u32 hasFXSR : 1;
    u32 hasHLE : 1;
    u32 hasINVPCID : 1;
    u32 hasLAHF : 1;
    u32 hasLZCNT : 1;
    u32 hasMMX : 1;
    u32 hasMMXEXT : 1;
    u32 hasMONITOR : 1;
    u32 hasMOVBE : 1;
    u32 hasMSR : 1;
    u32 hasOSXSAVE : 1;
    u32 hasPCLMULQDQ : 1;
    u32 hasPOPCNT : 1;
    u32 hasPREFETCHWT1 : 1;
    u32 hasRDRAND : 1;
    u32 hasRDSEED : 1;
    u32 hasRDTSCP : 1;
    u32 hasRTM : 1;
    u32 hasSEP : 1;
    u32 hasSHA : 1;
    u32 hasSSE : 1;
    u32 hasSSE2 : 1;
    u32 hasSSE3 : 1;
    u32 hasSSE41 : 1;
    u32 hasSSE42 : 1;
    u32 hasSSE4a : 1;
    u32 hasSSSE3 : 1;
    u32 hasSYSCALL : 1;
    u32 hasTBM : 1;
    u32 hasXOP : 1;
    u32 hasXSAVE : 1;
    u32 padding1 : 6;
    char vendor[17]; // 12-byte CPUID vendor string + NUL (zero-filled by memset)
    char model[49];  // 48-byte brand string from leaves 0x80000002..4 + NUL
    char padding2[2];
};

namespace internal_generic
{

// Raw EAX/EBX/ECX/EDX output of one CPUID invocation.
struct cpu_data
{
    u32 data[4];
};

#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG
// Executes CPUID with EAX=func, ECX=subfunc; writes the four result registers.
KFR_INTRINSIC u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx)
{
    __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc));
    return 1;
}
KFR_INTRINSIC void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
{
    get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]);
}
// XGETBV(0): reads XCR0, the OS-enabled extended-state mask.
// XGETBV writes EDX:EAX; only EAX is needed, EDX is declared clobbered.
KFR_INTRINSIC u32 get_xcr0()
{
    u32 xcr0;
    __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
    return xcr0;
}
#elif defined CMT_COMPILER_MSVC

KFR_INTRINSIC void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
{
    __cpuidex((int*)ptr, (int)func, (int)subfunc);
}
KFR_INTRINSIC u32 get_xcr0()
{
#ifdef _XCR_XFEATURE_ENABLED_MASK
    unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (u32)Result;
#else
    // Toolchain too old to expose _xgetbv: report no OS-enabled extended state.
    return 0;
#endif
}
#endif

// Queries CPUID leaves 0, 1, 7 and 0x80000001..0x80000004, fills a cpu_features
// bitfield, and maps the result to the best supported cpu_t instruction set.
// The dummy template parameter keeps this definition header-only without
// violating the one-definition rule.
template <size_t = 0>
cpu_t detect_cpu()
{
    cpu_features c;
    memset(&c, 0, sizeof(c));
    cpu_data data0;
    cpu_data exdata0;

    u32 f_1_ECX(0);
    u32 f_1_EDX(0);
    u32 f_7_EBX(0);
    u32 f_7_ECX(0);
    u32 f_81_ECX(0);
    u32 f_81_EDX(0);

    cpuid(data0.data, 0);
    c.max = static_cast<u32>(data0.data[0]);
    cpuid(exdata0.data, 0x80000000);
    c.exmax = static_cast<u32>(exdata0.data[0]);

    // Vendor string is returned in EBX, EDX, ECX order (12 bytes).
    *ptr_cast<u32>(c.vendor)     = static_cast<u32>(data0.data[1]);
    *ptr_cast<u32>(c.vendor + 4) = static_cast<u32>(data0.data[3]);
    *ptr_cast<u32>(c.vendor + 8) = static_cast<u32>(data0.data[2]);

    c.isIntel = strncmp(c.vendor, "GenuineIntel", sizeof(c.vendor)) == 0 ? 1 : 0;
    c.isAMD   = strncmp(c.vendor, "AuthenticAMD", sizeof(c.vendor)) == 0 ? 1 : 0;

    if (c.max >= 1)
    {
        cpu_data data1;
        cpuid(data1.data, 1);
        f_1_ECX = static_cast<u32>(data1.data[2]);
        f_1_EDX = static_cast<u32>(data1.data[3]);
    }

    if (c.max >= 7)
    {
        cpu_data data7;
        cpuid(data7.data, 7); // subleaf 0 (default subfunc)
        f_7_EBX = static_cast<u32>(data7.data[1]);
        f_7_ECX = static_cast<u32>(data7.data[2]);
    }

    if (c.exmax >= 0x80000001)
    {
        cpu_data data81;
        cpuid(data81.data, 0x80000001);
        f_81_ECX = static_cast<u32>(data81.data[2]);
        f_81_EDX = static_cast<u32>(data81.data[3]);
    }

    if (c.exmax >= 0x80000004)
    {
        // Brand string: 3 leaves x 16 bytes = 48 bytes.
        cpu_data data82;
        cpu_data data83;
        cpu_data data84;
        cpuid(data82.data, 0x80000002);
        cpuid(data83.data, 0x80000003);
        cpuid(data84.data, 0x80000004);
        memcpy(c.model, data82.data, sizeof(cpu_data));
        memcpy(c.model + 16, data83.data, sizeof(cpu_data));
        memcpy(c.model + 32, data84.data, sizeof(cpu_data));
    }

    // Feature-bit positions follow the CPUID specification for each leaf.
    c.hasSSE3        = f_1_ECX >> 0 & 1;
    c.hasPCLMULQDQ   = f_1_ECX >> 1 & 1;
    c.hasMONITOR     = f_1_ECX >> 3 & 1;
    c.hasSSSE3       = f_1_ECX >> 9 & 1;
    c.hasFMA         = f_1_ECX >> 12 & 1;
    c.hasCMPXCHG16B  = f_1_ECX >> 13 & 1;
    c.hasSSE41       = f_1_ECX >> 19 & 1;
    c.hasSSE42       = f_1_ECX >> 20 & 1;
    c.hasMOVBE       = f_1_ECX >> 22 & 1;
    c.hasPOPCNT      = f_1_ECX >> 23 & 1;
    c.hasAES         = f_1_ECX >> 25 & 1;
    c.hasXSAVE       = f_1_ECX >> 26 & 1;
    c.hasOSXSAVE     = f_1_ECX >> 27 & 1;
    c.hasAVX         = f_1_ECX >> 28 & 1;
    c.hasF16C        = f_1_ECX >> 29 & 1;
    c.hasRDRAND      = f_1_ECX >> 30 & 1;
    c.hasMSR         = f_1_EDX >> 5 & 1;
    c.hasCX8         = f_1_EDX >> 8 & 1;
    c.hasSEP         = f_1_EDX >> 11 & 1;
    c.hasCMOV        = f_1_EDX >> 15 & 1;
    c.hasCLFSH       = f_1_EDX >> 19 & 1;
    c.hasMMX         = f_1_EDX >> 23 & 1;
    c.hasFXSR        = f_1_EDX >> 24 & 1;
    c.hasSSE         = f_1_EDX >> 25 & 1;
    c.hasSSE2        = f_1_EDX >> 26 & 1;
    c.hasFSGSBASE    = f_7_EBX >> 0 & 1;
    c.hasBMI1        = f_7_EBX >> 3 & 1;
    // Vendor-gated flags mirror the vendor-specific meaning of these bits.
    c.hasHLE         = c.isIntel && f_7_EBX >> 4 & 1;
    c.hasAVX2        = f_7_EBX >> 5 & 1;
    c.hasBMI2        = f_7_EBX >> 8 & 1;
    c.hasERMS        = f_7_EBX >> 9 & 1;
    c.hasINVPCID     = f_7_EBX >> 10 & 1;
    c.hasRTM         = c.isIntel && f_7_EBX >> 11 & 1;
    c.hasAVX512F     = f_7_EBX >> 16 & 1;
    c.hasAVX512DQ    = f_7_EBX >> 17 & 1;
    c.hasRDSEED      = f_7_EBX >> 18 & 1;
    c.hasADX         = f_7_EBX >> 19 & 1;
    c.hasAVX512PF    = f_7_EBX >> 26 & 1;
    c.hasAVX512ER    = f_7_EBX >> 27 & 1;
    c.hasAVX512CD    = f_7_EBX >> 28 & 1;
    c.hasSHA         = f_7_EBX >> 29 & 1;
    c.hasAVX512BW    = f_7_EBX >> 30 & 1;
    c.hasAVX512VL    = f_7_EBX >> 31 & 1;
    c.hasPREFETCHWT1 = f_7_ECX >> 0 & 1;
    c.hasLAHF        = f_81_ECX >> 0 & 1;
    c.hasLZCNT       = c.isIntel && f_81_ECX >> 5 & 1;
    c.hasABM         = c.isAMD && f_81_ECX >> 5 & 1;
    c.hasSSE4a       = c.isAMD && f_81_ECX >> 6 & 1;
    c.hasXOP         = c.isAMD && f_81_ECX >> 11 & 1;
    c.hasTBM         = c.isAMD && f_81_ECX >> 21 & 1;
    c.hasSYSCALL     = c.isIntel && f_81_EDX >> 11 & 1;
    c.hasMMXEXT      = c.isAMD && f_81_EDX >> 22 & 1;
    c.hasRDTSCP      = c.isIntel && f_81_EDX >> 27 & 1;
    c.has3DNOWEXT    = c.isAMD && f_81_EDX >> 30 & 1;
    c.has3DNOW       = c.isAMD && f_81_EDX >> 31 & 1;

    // AVX needs the OS to enable XMM+YMM state in XCR0 (bits 1-2 -> mask 0x06);
    // AVX-512 additionally needs opmask/ZMM state (bits 5-7 -> mask 0xE0).
    c.hasAVXOSSUPPORT    = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06;
    c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0;

    // Highest-to-lowest decision ladder; the first satisfied tier wins.
    if (c.hasAVX512F && c.hasAVX512CD && c.hasAVX512VL && c.hasAVX512BW && c.hasAVX512DQ &&
        c.hasAVX512OSSUPPORT)
        return cpu_t::avx512;
    if (c.hasAVX2 && c.hasAVXOSSUPPORT)
        return cpu_t::avx2;
    if (c.hasAVX && c.hasAVXOSSUPPORT)
        return cpu_t::avx1;
    if (c.hasSSE41)
        return cpu_t::sse41;
    if (c.hasSSSE3)
        return cpu_t::ssse3;
    if (c.hasSSE3)
        return cpu_t::sse3;
    if (c.hasSSE2)
        return cpu_t::sse2;
    return cpu_t::lowest;
}
} // namespace internal_generic
#else

// Non-x86: no runtime dispatch, report the compile-time target.
// NOTE(review): this fallback lives directly in kfr::, while the x86 version is
// in kfr::internal_generic:: - callers appear to rely on unqualified lookup; confirm.
template <size_t = 0>
cpu_t detect_cpu()
{
    return cpu_t::native;
}

#endif
} // namespace kfr
diff --git a/include/kfr/runtime/cpuid_auto.hpp b/include/kfr/runtime/cpuid_auto.hpp
@@ -0,0 +1,62 @@
/** @addtogroup cpuid
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General
Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "cpuid.hpp"

namespace kfr
{

namespace internal_generic
{

// Function-local static holding the detected instruction set;
// defaults to the compile-time target until init_cpu_v() runs.
KFR_INTRINSIC cpu_t& cpu_v()
{
    static cpu_t v1 = cpu_t::native;
    return v1;
}

// Runs CPUID-based detection once and caches the result in cpu_v().
KFR_INTRINSIC char init_cpu_v()
{
    cpu_v() = detect_cpu<0>();
    return 0;
}

KFR_INTRINSIC char init_dummyvar()
{
    // The static ensures detection runs at most once per program.
    static char dummy = init_cpu_v();
    return dummy;
}

// Including this header forces CPU detection during static initialization.
static char dummyvar = init_dummyvar();
} // namespace internal_generic

/**
 * @brief Returns cpu instruction set detected at runtime.
 */
KFR_FUNCTION cpu_t get_cpu() { return internal_generic::cpu_v(); }

} // namespace kfr
diff --git a/include/kfr/simd.hpp b/include/kfr/simd.hpp
@@ -0,0 +1,36 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "simd/comparison.hpp" +#include "simd/complex.hpp" +#include "simd/constants.hpp" +#include "simd/digitreverse.hpp" +#include "simd/horizontal.hpp" +#include "simd/mask.hpp" +#include "simd/operators.hpp" +#include "simd/platform.hpp" +#include "simd/read_write.hpp" +#include "simd/shuffle.hpp" +#include "simd/types.hpp" +#include "simd/vec.hpp" diff --git a/include/kfr/simd/comparison.hpp b/include/kfr/simd/comparison.hpp @@ -0,0 +1,152 @@ +/** @addtogroup logical + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "constants.hpp" +#include "impl/function.hpp" +#include "vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> equal(const T1& x, const T2& y) +{ + return x == y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> notequal(const T1& x, const T2& y) +{ + return x != y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> less(const T1& x, const T2& y) +{ + return x < y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> greater(const T1& x, const T2& y) +{ + return x > y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> lessorequal(const T1& x, const T2& y) +{ + return x <= y; +} +template <typename T1, typename T2> +inline maskfor<common_type<T1, T2>> greaterorequal(const T1& x, const T2& y) +{ + return x >= y; +} +KFR_FN(equal) +KFR_FN(notequal) +KFR_FN(less) +KFR_FN(greater) +KFR_FN(lessorequal) +KFR_FN(greaterorequal) + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::equal, E1, E2> operator==(E1&& e1, E2&& e2) +{ + return { fn::equal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::notequal, E1, E2> operator!=(E1&& e1, E2&& e2) +{ + return { fn::notequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::less, E1, E2> operator<(E1&& e1, E2&& e2) +{ + return { fn::less(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::greater, E1, E2> 
operator>(E1&& e1, E2&& e2) +{ + return { fn::greater(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::lessorequal, E1, E2> operator<=(E1&& e1, E2&& e2) +{ + return { fn::lessorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::greaterorequal, E1, E2> operator>=(E1&& e1, E2&& e2) +{ + return { fn::greaterorequal(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isnan(const vec<T, N>& x) +{ + return x != x; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isinf(const vec<T, N>& x) +{ + return x == avoid_odr_use(constants<T>::infinity) || x == -constants<T>::infinity; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isfinite(const vec<T, N>& x) +{ + return !isnan(x) && !isinf(x); +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> isnegative(const vec<T, N>& x) +{ + return (x & constants<T>::highbitmask()) != 0; +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> ispositive(const vec<T, N>& x) +{ + return !isnegative(x); +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> iszero(const vec<T, N>& x) +{ + return x == T(); +} + +template <typename T1, typename T2, typename T3> +KFR_INTRINSIC maskfor<common_type<T1, T2, T3>> inrange(const T1& x, const T2& min, const T3& max) +{ + return x >= min && x <= max; +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/complex.hpp b/include/kfr/simd/complex.hpp @@ -0,0 +1,468 @@ +/** @addtogroup complex + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the 
GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once
#include "constants.hpp"
#include "impl/function.hpp"
#include "operators.hpp"

#ifdef KFR_STD_COMPLEX
#include <complex>
#endif

CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4814))

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

#ifdef KFR_STD_COMPLEX

template <typename T>
using complex = std::complex<T>;

#else
#ifndef KFR_CUSTOM_COMPLEX

/**
 * @brief Represents the complex numbers. If KFR_STD_COMPLEX is defined, then kfr::complex is an alias for
 * std::complex.
 */
template <typename T>
struct complex
{
    static_assert(is_simd_type<T>::value, "Incorrect type for complex");
    constexpr static bool is_pod = true;
    constexpr complex() CMT_NOEXCEPT = default;
    KFR_MEM_INTRINSIC constexpr complex(T re) CMT_NOEXCEPT : re(re), im(0) {}
    KFR_MEM_INTRINSIC constexpr complex(T re, T im) CMT_NOEXCEPT : re(re), im(im) {}
    constexpr complex(const complex&) CMT_NOEXCEPT = default;
    constexpr complex(complex&&) CMT_NOEXCEPT = default;
    // Converting constructors: element-wise static_cast from complex<U>.
    template <typename U>
    KFR_MEM_INTRINSIC constexpr complex(const complex<U>& other) CMT_NOEXCEPT : re(static_cast<T>(other.re)),
                                                                                im(static_cast<T>(other.im))
    {
    }
    template <typename U>
    KFR_MEM_INTRINSIC constexpr complex(complex<U>&& other) CMT_NOEXCEPT : re(std::move(other.re)),
                                                                           im(std::move(other.im))
    {
    }
#ifdef CMT_COMPILER_GNU
    // constexpr defaulted assignment only on GNU-compatible compilers;
    // presumably MSVC rejects it here (cf. warning 4814 disabled above) - confirm.
    constexpr complex& operator=(const complex&) CMT_NOEXCEPT = default;
    constexpr complex& operator=(complex&&) CMT_NOEXCEPT = default;
#else
    complex& operator=(const complex&) = default;
    complex& operator=(complex&&) = default;
#endif
    KFR_MEM_INTRINSIC constexpr const T& real() const CMT_NOEXCEPT { return re; }
    KFR_MEM_INTRINSIC constexpr const T& imag() const CMT_NOEXCEPT { return im; }
    KFR_MEM_INTRINSIC constexpr void real(T value) CMT_NOEXCEPT { re = value; }
    KFR_MEM_INTRINSIC constexpr void imag(T value) CMT_NOEXCEPT { im = value; }
    // Storage layout: real part first, imaginary part second.
    T re;
    T im;

    // Scalar complex arithmetic is routed through 1-wide vectors so it shares
    // the SIMD implementation of the operators.
    KFR_MEM_INTRINSIC friend complex operator+(const complex& x, const complex& y)
    {
        return (make_vector(x) + make_vector(y))[0];
    }
    KFR_MEM_INTRINSIC friend complex operator-(const complex& x, const complex& y)
    {
        return (make_vector(x) - make_vector(y))[0];
    }
    KFR_MEM_INTRINSIC friend complex operator*(const complex& x, const complex& y)
    {
        return (make_vector(x) * make_vector(y))[0];
    }
    KFR_MEM_INTRINSIC friend complex operator/(const complex& x, const complex& y)
    {
        return (make_vector(x) / make_vector(y))[0];
    }

    // Mixed complex/scalar operators: both operands are promoted to the
    // common type C before the operation.
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator+(const complex& x, const U& y)
    {
        return static_cast<C>(x) + static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator-(const complex& x, const U& y)
    {
        return static_cast<C>(x) - static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator*(const complex& x, const U& y)
    {
        return static_cast<C>(x) * static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator/(const complex& x, const U& y)
    {
        return static_cast<C>(x) / static_cast<C>(y);
    }

    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator+(const U& x, const complex& y)
    {
        return static_cast<C>(x) + static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator-(const U& x, const complex& y)
    {
        return static_cast<C>(x) - static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator*(const U& x, const complex& y)
    {
        return static_cast<C>(x) * static_cast<C>(y);
    }
    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
    KFR_MEM_INTRINSIC friend C operator/(const U& x, const complex& y)
    {
        return static_cast<C>(x) / static_cast<C>(y);
    }
    KFR_MEM_INTRINSIC friend complex operator-(const complex& x) { return (-make_vector(x))[0]; }
    KFR_MEM_INTRINSIC friend complex operator+(const complex& x) { return x; }
};
#endif
#endif
} // namespace CMT_ARCH_NAME
} //
namespace kfr +namespace cometa +{ +template <typename T> +struct compound_type_traits<kfr::complex<T>> +{ + constexpr static size_t width = 2; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + template <typename U> + using rebind = kfr::complex<U>; + template <typename U> + using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>; + + static constexpr subtype at(const kfr::complex<T>& value, size_t index) + { + return index == 0 ? value.real() : value.imag(); + } +}; +} // namespace cometa +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/// @brief Alias for complex<f32> +using c32 = complex<f32>; + +/// @brief Alias for complex<f64> +using c64 = complex<f64>; + +/// @brief Alias for complex<fbase> +using cbase = complex<fbase>; + +namespace intrinsics +{ +template <typename T> +constexpr inline complex<T> vcomplex(const vec<T, 2>& v) +{ + return complex<T>(v.front(), v.back()); +} +template <typename T> +constexpr inline vec<T, 2> vcomplex(const complex<T>& v) +{ + return vec<T, 2>(v.real(), v.imag()); +} +template <typename T> +constexpr inline simd<T, 2> vvcomplex(const complex<T>& v) +{ + return intrinsics::simd_make(ctype<T>, v.real(), v.imag()); +} +} // namespace intrinsics + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevector(const vec<complex<T>, N>& x, + csizes_t<indices...>) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd_t<T, N>{}, x.v, scale<2, indices...>(), overload_auto); +} +template <typename T, size_t N, size_t... 
indices> +KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevectors(const vec<complex<T>, N>& x, + const vec<T, N>& y, + csizes_t<indices...>) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, scale<2, indices...>(), + overload_auto); +} +namespace internal +{ +template <typename T> +struct compoundcast<complex<T>> +{ + static vec<T, 2> to_flat(const complex<T>& x) { return { x.real(), x.imag() }; } + static complex<T> from_flat(const vec<T, 2>& x) { return { x.front(), x.back() }; } +}; + +template <typename T, size_t N> +struct compoundcast<vec<complex<T>, N>> +{ + static vec<T, N * 2> to_flat(const vec<complex<T>, N>& x) { return x.flatten(); } + static vec<complex<T>, N / 2> from_flat(const vec<T, N>& x) + { + return vec<complex<T>, N / 2>::from_flatten(x); + } +}; +} // namespace internal + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<complex<T>, N / 2> ccomp(const vec<T, N>& x) +{ + return vec<complex<T>, N / 2>::from_flatten(x); +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N * 2> cdecom(const vec<complex<T>, N>& x) +{ + return x.flatten(); +} + +/// @brief Returns vector of complex values with real part duplicated +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x) +{ + return ccomp(dupeven(cdecom(x))); +} +KFR_FN(cdupreal) + +/// @brief Returns vector of complex values with imaginary part duplicated +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x) +{ + return ccomp(dupodd(cdecom(x))); +} +KFR_FN(cdupimag) + +/// @brief Returns vector of complex values with real and imaginary parts swapped +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x) +{ + return ccomp(swap<2>(cdecom(x))); +} +KFR_FN(cswapreim) + +/// @brief Returns vector of complex values with real part negated +template <typename T, size_t N> 
+KFR_INTRINSIC vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x) +{ + return x ^ complex<T>(-T(), T()); +} +KFR_FN(cnegreal) + +/// @brief Returns vector of complex values with imaginary part negated +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x) +{ + return x ^ complex<T>(T(), -T()); +} +KFR_FN(cnegimag) + +namespace internal +{ +template <typename T> +struct is_complex_impl : std::false_type +{ +}; +template <typename T> +struct is_complex_impl<complex<T>> : std::true_type +{ +}; + +// vector<complex> to vector<complex> +template <typename To, typename From, size_t N> +struct conversion<vec<complex<To>, N>, vec<complex<From>, N>> +{ + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<complex<To>, N> cast(const vec<complex<From>, N>& value) + { + return vec<To, N * 2>(value.flatten()).v; + } +}; + +// vector to vector<complex> +template <typename To, typename From, size_t N> +struct conversion<vec<complex<To>, N>, vec<From, N>> +{ + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<complex<To>, N> cast(const vec<From, N>& value) + { + const vec<To, N> casted = static_cast<vec<To, N>>(value); + return interleave(casted, zerovector(casted)).v; + } +}; + +} // namespace internal + +/// @brief Returns the real part of the complex value +template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> +constexpr KFR_INTRINSIC T real(const T& value) +{ + return value; +} + +/// @brief Returns the real part of the complex value +template <typename T> +constexpr KFR_INTRINSIC T real(const complex<T>& value) +{ + return value.real(); +} + +/// @brief Returns the real part of the complex value +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> real(const vec<complex<T>, N>& value) +{ + return even(cdecom(value)); +} + +template <typename T> +using realtype = 
decltype(kfr::real(std::declval<T>())); +template <typename T> +using realftype = ftype<decltype(kfr::real(std::declval<T>()))>; + +KFR_FN(real) + +/// @brief Returns the real part of the complex value +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::real, E1> real(E1&& x) +{ + return { {}, std::forward<E1>(x) }; +} + +/// @brief Returns the imaginary part of the complex value +template <typename T> +constexpr KFR_INTRINSIC T imag(const complex<T>& value) +{ + return value.imag(); +} + +/// @brief Returns the imaginary part of the complex value +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> imag(const vec<complex<T>, N>& value) +{ + return odd(cdecom(value)); +} +KFR_FN(imag) + +/// @brief Returns the imaginary part of the complex value +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::imag, E1> imag(E1&& x) +{ + return { {}, std::forward<E1>(x) }; +} + +/// @brief Constructs complex value from real and imaginary parts +template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>> +constexpr KFR_INTRINSIC vec<complex<T>, N> make_complex(const vec<T1, N>& real, + const vec<T2, N>& imag = T2(0)) +{ + return ccomp(interleave(innercast<T>(real), innercast<T>(imag))); +} + +/// @brief Constructs complex value from real and imaginary parts +template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>> +constexpr KFR_INTRINSIC complex<T> make_complex(T1 real, T2 imag = T2(0)) +{ + return complex<T>(innercast<T>(real), innercast<T>(imag)); +} + +namespace intrinsics +{ +template <typename T, size_t N> +KFR_INTRINSIC vec<complex<T>, N> cconj(const vec<complex<T>, N>& x) +{ + return cnegimag(x); +} +} // namespace intrinsics + +/// @brief Returns the complex conjugate of the complex number x +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +KFR_INTRINSIC T1 
cconj(const T1& x) +{ + return intrinsics::cconj(x); +} + +template <size_t N> +struct vec_of_complex +{ + template <typename T> + using type = vec<complex<T>, N>; +}; +} // namespace CMT_ARCH_NAME + +template <typename T1, typename T2> +struct common_type_impl<kfr::complex<T1>, kfr::complex<T2>> : common_type_from_subtypes<T1, T2, kfr::complex> +{ +}; +template <typename T1, typename T2> +struct common_type_impl<kfr::complex<T1>, T2> : common_type_from_subtypes<T1, T2, kfr::complex> +{ +}; +template <typename T1, typename T2> +struct common_type_impl<T1, kfr::complex<T2>> : common_type_from_subtypes<T1, T2, kfr::complex> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<kfr::complex<T1>, N>, kfr::vec<kfr::complex<T2>, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::complex<T1>, kfr::vec<T2, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<T1, N>, kfr::complex<T2>> + : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type> +{ +}; +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/simd/constants.hpp b/include/kfr/simd/constants.hpp @@ -0,0 +1,160 @@ +/** @addtogroup constants + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the 
terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "types.hpp" +#include <limits> + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4309)) +CMT_PRAGMA_MSVC(warning(disable : 4146)) + +namespace kfr +{ + +#if CMT_COMPILER_GNU +constexpr double infinity = __builtin_inf(); +constexpr double qnan = __builtin_nan(""); +#else +constexpr double infinity = HUGE_VAL; +constexpr double qnan = NAN; +#endif +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Woverflow") + +template <typename T> +struct scalar_constants +{ + constexpr static T pi_s(int m, int d = 1) { return pi * m / d; } + constexpr static T recip_pi_s(int m, int d = 1) { return recip_pi * m / d; } + + constexpr static T pi = static_cast<T>(3.1415926535897932384626433832795); + constexpr static T sqr_pi = static_cast<T>(9.8696044010893586188344909998762); + constexpr static T recip_pi = static_cast<T>(0.31830988618379067153776752674503); + constexpr static T degtorad = static_cast<T>(pi / 180); + constexpr static T radtodeg = static_cast<T>(pi * 180); + constexpr static T e = static_cast<T>(2.718281828459045235360287471352662); + constexpr static T recip_log_2 = static_cast<T>(1.442695040888963407359924681001892137426645954); + 
constexpr static T recip_log_10 = static_cast<T>(0.43429448190325182765112891891661); + constexpr static T log_2 = static_cast<T>(0.69314718055994530941723212145818); + constexpr static T log_10 = static_cast<T>(2.3025850929940456840179914546844); + constexpr static T sqrt_2 = static_cast<T>(1.4142135623730950488016887242097); + + constexpr static T fold_constant_div = choose_const<T>( + CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01)); + + constexpr static T fold_constant_hi = choose_const<T>( + CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01)); + constexpr static T fold_constant_rem1 = + choose_const<T>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f), + CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08)); + constexpr static T fold_constant_rem2 = + choose_const<T>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f), + CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15)); + + constexpr static T epsilon = std::numeric_limits<T>::epsilon(); + constexpr static T infinity = std::numeric_limits<T>::infinity(); + constexpr static T neginfinity = -std::numeric_limits<T>::infinity(); + constexpr static T qnan = std::numeric_limits<T>::quiet_NaN(); +}; + +template <typename T> +struct constants : public scalar_constants<subtype<T>> +{ +public: + using Tsub = subtype<T>; +}; + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +/// π (pi) +/// c_pi<f64, 4> = 4pi +/// c_pi<f64, 3, 4> = 3/4pi +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d); + +/// π² (pi²) +/// c_sqr_pi<f64, 4> = 4pi² +/// c_sqr_pi<f64, 3, 4> = 3/4pi² +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d); + +/// 1/π (1/pi) +/// c_recip_pi<f64> 1/pi +/// c_recip_pi<f64, 4> 4/pi +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_recip_pi = 
subtype<T>(0.31830988618379067153776752674503 * m / d); + +/// degree to radian conversion factor +template <typename T> +constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>; + +/// radian to degree conversion factor +template <typename T> +constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>; + +/// e, Euler's number +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d); + +template <typename T> +constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 32 ? 23 : 52; + +template <typename T> +constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1; + +template <typename T> +constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon()); + +/// infinity +template <typename T> +constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity(); + +/// -infinity +template <typename T> +constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity(); + +/// Quiet NaN +template <typename T> +constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN(); + +template <typename T> +constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954); + +template <typename T> +constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661); + +template <typename T> +constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818); + +template <typename T> +constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844); + +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d); +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/simd/digitreverse.hpp b/include/kfr/simd/digitreverse.hpp @@ -0,0 +1,110 @@ +/** @addtogroup shuffle + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free 
software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once +#include "shuffle.hpp" +#include "types.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace internal +{ + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") + +constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } + +template <u32 m, u32 shift, u32... 
values> +constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) +{ + return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>()); +} + +template <size_t bits> +constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>) +{ + return bit_permute_step_impl( + x, + cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> + (32 - bits); +} + +template <size_t bits> +constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>) +{ + return bit_permute_step_impl( + x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> + (32 - bits); +} + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +template <size_t radix, size_t bits> +struct shuffle_index_digitreverse +{ + constexpr inline size_t operator()(size_t index) const CMT_NOEXCEPT + { + return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>()); + } +}; +} // namespace internal + +template <size_t radix, size_t group = 1, typename T, size_t N> +KFR_INTRINSIC vec<T, N> digitreverse(const vec<T, N>& x) +{ + return x.shuffle(scale<group>( + csizeseq<N / group>.map(internal::shuffle_index_digitreverse<radix, ilog2(N / group)>()))); +} + +template <size_t groupsize = 1, typename T, size_t N> +KFR_INTRINSIC vec<T, N> bitreverse(const vec<T, N>& x) +{ + return digitreverse<2, groupsize>(x); +} + +template <size_t groupsize = 1, typename T, size_t N> +KFR_INTRINSIC vec<T, N> digitreverse4(const vec<T, N>& x) +{ + return digitreverse<4, groupsize>(x); +} + +template <size_t bits> +constexpr inline u32 bitreverse(u32 x) +{ + return internal::digitreverse_impl<bits>(x, csize_t<2>()); +} + +template <size_t bits> +constexpr inline u32 digitreverse4(u32 x) +{ + return internal::digitreverse_impl<bits>(x, csize_t<4>()); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/horizontal.hpp b/include/kfr/simd/horizontal.hpp @@ -0,0 +1,138 @@ +/** @addtogroup horizontal + * @{ + */ 
+/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, typename ReduceFn> +KFR_INTRINSIC T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) +{ + return T(value.front()); +} + +template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))> +KFR_INTRINSIC T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) +{ + return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce)); +} +template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))> +KFR_INTRINSIC T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) +{ + const T initial = reduce(initialvalue<T>()); + return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce)); +} +} // namespace intrinsics + +template <typename T, size_t N, typename ReduceFn> +KFR_INTRINSIC T horizontal(const vec<T, N>& value, ReduceFn&& reduce) +{ + return 
intrinsics::horizontal_impl(value, std::forward<ReduceFn>(reduce)); +} + +/// @brief Sum all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hadd(const vec<T, N>& value) +{ + return horizontal(value, fn::add()); +} +KFR_FN(hadd) + +/// @brief Sum all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hsum(const vec<T, N>& value) +{ + return horizontal(value, fn::add()); +} +KFR_FN(hsum) + +/// @brief Multiply all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hmul(const vec<T, N>& value) +{ + return horizontal(value, fn::mul()); +} +KFR_FN(hmul) + +/// @brief Multiply all elements of the vector +template <typename T, size_t N> +KFR_INTRINSIC T hproduct(const vec<T, N>& value) +{ + return horizontal(value, fn::mul()); +} +KFR_FN(hproduct) + +template <typename T, size_t N> +KFR_INTRINSIC T hbitwiseand(const vec<T, N>& value) +{ + return horizontal(value, fn::bitwiseand()); +} +KFR_FN(hbitwiseand) +template <typename T, size_t N> +KFR_INTRINSIC T hbitwiseor(const vec<T, N>& value) +{ + return horizontal(value, fn::bitwiseor()); +} +KFR_FN(hbitwiseor) +template <typename T, size_t N> +KFR_INTRINSIC T hbitwisexor(const vec<T, N>& value) +{ + return horizontal(value, fn::bitwisexor()); +} +KFR_FN(hbitwisexor) + +/// @brief Calculate the Dot-Product of two vectors +template <typename T, size_t N> +KFR_INTRINSIC T hdot(const vec<T, N>& x, const vec<T, N>& y) +{ + return hadd(x * y); +} +KFR_FN(hdot) + +/// @brief Calculate the Arithmetic mean of all elements in the vector +template <typename T, size_t N> +KFR_INTRINSIC T havg(const vec<T, N>& value) +{ + return hadd(value) / N; +} +KFR_FN(havg) + +/// @brief Calculate the RMS of all elements in the vector +template <typename T, size_t N> +KFR_INTRINSIC T hrms(const vec<T, N>& value) +{ + return builtin_sqrt(hadd(value * value) / N); +} +KFR_FN(hrms) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/backend.hpp 
b/include/kfr/simd/impl/backend.hpp @@ -0,0 +1,79 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "simd.hpp" +#ifdef CMT_CLANG_EXT +#include "backend_clang.hpp" +#else +#include "backend_generic.hpp" +#endif + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#ifdef KFR_AUTOTESTS +template <typename T> +struct check_sizes +{ + static_assert(sizeof(simd<T, 1>) == sizeof(T), ""); + static_assert(sizeof(simd<T, 2>) == sizeof(T) * 2, ""); + static_assert(sizeof(simd<T, 3>) == sizeof(T) * 4, ""); + static_assert(sizeof(simd<T, 4>) == sizeof(T) * 4, ""); + static_assert(sizeof(simd<T, 5>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 6>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 7>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 8>) == sizeof(T) * 8, ""); + static_assert(sizeof(simd<T, 16>) == sizeof(T) * 16, ""); + static_assert(sizeof(simd<T, 32>) == sizeof(T) * 32, ""); + static_assert(sizeof(simd<T, 64>) == sizeof(T) * 64, ""); + static_assert(sizeof(simd<T, 128>) == sizeof(T) * 128, ""); + 
static_assert(sizeof(simd<T, 256>) == sizeof(T) * 256, ""); + static_assert(sizeof(simd<T, 512>) == sizeof(T) * 512, ""); + static_assert(sizeof(simd<T, 513>) == sizeof(T) * 1024, ""); + static_assert(sizeof(simd<T, 1023>) == sizeof(T) * 1024, ""); + static_assert(sizeof(simd<T, 1024>) == sizeof(T) * 1024, ""); +}; + +template struct check_sizes<float>; +template struct check_sizes<double>; +template struct check_sizes<uint8_t>; +template struct check_sizes<uint16_t>; +template struct check_sizes<uint32_t>; +template struct check_sizes<uint64_t>; +template struct check_sizes<int8_t>; +template struct check_sizes<int16_t>; +template struct check_sizes<int32_t>; +template struct check_sizes<int64_t>; + +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME + +using CMT_ARCH_NAME::intrinsics::simd; +} // namespace kfr diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp @@ -0,0 +1,228 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "simd.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename TT, size_t NN> +using simd = TT __attribute__((ext_vector_type(NN))); + +template <typename T, size_t N1> +KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x); + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... z); + +template <typename Tout> +KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete; + +template <typename Tout, typename Arg> +KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) +{ + return (simd<Tout, 1>){ static_cast<Tout>(arg) }; +} + +template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)> +KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) +{ + return (simd<Tout, N>){ static_cast<Tout>(args)... 
}; +} + +/// @brief Returns vector with undefined value +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_undefined() +{ + simd<Tout, N> x; + return x; +} + +/// @brief Returns vector with all zeros +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_zeros() +{ + return Tout(); +} + +/// @brief Returns vector with all ones +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_allones() +{ + return special_constants<Tout>::allones(); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout))> +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) +{ + return (simd<Tout, Nout>)x; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) +{ + return x; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) +{ + return value[index]; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) +{ + value[index] = x; + return value; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) +{ + return value; +} + +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>, + overload_generic) +{ + return __builtin_shufflevector(x, x, (indices > N ? -1 : static_cast<int>(indices))...); +} + +template <typename T, size_t N, size_t N2 = N, size_t... 
indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizes_t<indices...>, overload_generic) +{ + static_assert(N == N2, ""); + return __builtin_shufflevector(x, y, (indices > 2 * N ? -1 : static_cast<int>(indices))...); +} + +template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y, + csizes_t<indices...>, overload_generic) +{ + constexpr size_t Nmax = (N1 > N2 ? N1 : N2); + return simd_shuffle( + simd2_t<T, Nmax, Nmax>{}, simd_shuffle(simd_t<T, N1>{}, x, csizeseq<Nmax>, overload_auto), + simd_shuffle(simd_t<T, N2>{}, y, csizeseq<Nmax>, overload_auto), + csizes<(indices < N1 ? indices : indices < N1 + N2 ? indices + (Nmax - N1) : index_undefined)...>, + overload_auto); +} + +template <typename T, size_t N1> +KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x) +{ + return x; +} + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... 
z) +{ + return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...), + csizeseq<N1 + N2 + Nscount>, overload_auto); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) +{ + return __builtin_convertvector(x, simd<Tout, N>); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) +{ + return x; +} + +template <typename T, size_t N, bool A> +using simd_storage = struct_with_alignment<simd<T, N>, A>; + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) +{ + return ptr_cast<simd_storage<T, N, A>>(src)->value; +} + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>); + constexpr auto concat_indices = cvalseq_t<size_t, N>(); + return simd_shuffle( + simd2_t<T, first, first>{}, simd_read<first, A>(src), + simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto), + concat_indices, overload_auto); +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) +{ + ptr_cast<simd_storage<T, N, A>>(dest)->value = value; +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + 
simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq<first>, overload_auto)); + simd_write<false, rest>(dest + first, + simd_shuffle(simd_t<T, N>{}, value, csizeseq<rest, first>, overload_auto)); +} + +template <typename T, size_t N> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) +{ + return value[index]; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, size_t index, T x) +{ + value[index] = x; + return value; +} +} // namespace intrinsics +} // namespace CMT_ARCH_NAME + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -0,0 +1,1080 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "simd.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wuninitialized") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunknown-warning-option") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") + +namespace kfr +{ + +#if KFR_SHOW_NOT_OPTIMIZED +CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT; +#else +#define not_optimized(...) \ + do \ + { \ + } while (0) +#endif + +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N> +using simd = typename simd_type<T, N>::type; + +template <typename T, size_t N, typename U> +union simd_small_array { + static_assert(sizeof(T) * N == sizeof(U), ""); + T arr[N]; + U whole; + + KFR_INTRINSIC static constexpr simd_small_array from(U whole) + { + union { + const U w; + simd_small_array r; + } u{ whole }; + return u.r; + } +}; + +#define KFR_SIMD_TYPE(T, N, ...) \ + template <> \ + struct simd_type<T, N> \ + { \ + using type = __VA_ARGS__; \ + }; + +#define KFR_SIMD_SMALL_TYPE(T, N, U) \ + template <> \ + struct simd_type<T, N> \ + { \ + using type = simd_small_array<T, N, U>; \ + }; + +template <typename T> +struct simd_type<T, 1> +{ + using type = T; +}; + +template <typename T, size_t N> +struct simd_type +{ + using type = simd_halves<T, N>; +}; + +KFR_SIMD_SMALL_TYPE(u8, 2, u16) +KFR_SIMD_SMALL_TYPE(i8, 2, u16) + +KFR_SIMD_SMALL_TYPE(u8, 4, u32) +KFR_SIMD_SMALL_TYPE(u16, 2, u32) +KFR_SIMD_SMALL_TYPE(i8, 4, u32) +KFR_SIMD_SMALL_TYPE(i16, 2, u32) + +KFR_SIMD_SMALL_TYPE(u8, 8, u64) +KFR_SIMD_SMALL_TYPE(u16, 4, u64) +KFR_SIMD_SMALL_TYPE(u32, 2, u64) +KFR_SIMD_SMALL_TYPE(i8, 8, u64) +KFR_SIMD_SMALL_TYPE(i16, 4, u64) +KFR_SIMD_SMALL_TYPE(i32, 2, u64) + +KFR_SIMD_SMALL_TYPE(f32, 2, f64) + +#ifdef CMT_ARCH_SSE +KFR_SIMD_TYPE(f32, 4, __m128) +KFR_SIMD_TYPE(f64, 2, __m128d) +#endif // CMT_ARCH_SSE + +#ifdef CMT_ARCH_SSE2 +KFR_SIMD_TYPE(u8, 16, 
__m128i) +KFR_SIMD_TYPE(u16, 8, __m128i) +KFR_SIMD_TYPE(u32, 4, __m128i) +KFR_SIMD_TYPE(u64, 2, __m128i) +KFR_SIMD_TYPE(i8, 16, __m128i) +KFR_SIMD_TYPE(i16, 8, __m128i) +KFR_SIMD_TYPE(i32, 4, __m128i) +KFR_SIMD_TYPE(i64, 2, __m128i) +#endif // CMT_ARCH_SSE2 + +#ifdef CMT_ARCH_AVX +KFR_SIMD_TYPE(float, 8, __m256) +KFR_SIMD_TYPE(double, 4, __m256d) +KFR_SIMD_TYPE(u8, 32, __m256i) +KFR_SIMD_TYPE(u16, 16, __m256i) +KFR_SIMD_TYPE(u32, 8, __m256i) +KFR_SIMD_TYPE(u64, 4, __m256i) +KFR_SIMD_TYPE(i8, 32, __m256i) +KFR_SIMD_TYPE(i16, 16, __m256i) +KFR_SIMD_TYPE(i32, 8, __m256i) +KFR_SIMD_TYPE(i64, 4, __m256i) +#endif // CMT_ARCH_AVX + +#ifdef CMT_ARCH_AVX512 +KFR_SIMD_TYPE(float, 16, __m512) +KFR_SIMD_TYPE(double, 8, __m512d) +KFR_SIMD_TYPE(u8, 64, __m512i) +KFR_SIMD_TYPE(u16, 32, __m512i) +KFR_SIMD_TYPE(u32, 16, __m512i) +KFR_SIMD_TYPE(u64, 8, __m512i) +KFR_SIMD_TYPE(i8, 64, __m512i) +KFR_SIMD_TYPE(i16, 32, __m512i) +KFR_SIMD_TYPE(i32, 16, __m512i) +KFR_SIMD_TYPE(i64, 8, __m512i) +#endif // CMT_ARCH_AVX512 + +#ifdef CMT_ARCH_NEON +KFR_SIMD_TYPE(u8, 16, uint8x16_t); +KFR_SIMD_TYPE(u16, 8, uint16x8_t); +KFR_SIMD_TYPE(u32, 4, uint32x4_t); +KFR_SIMD_TYPE(u64, 2, uint64x2_t); +KFR_SIMD_TYPE(i8, 16, int8x16_t); +KFR_SIMD_TYPE(i16, 8, int16x8_t); +KFR_SIMD_TYPE(i32, 4, int32x4_t); +KFR_SIMD_TYPE(i64, 2, int64x2_t); +KFR_SIMD_TYPE(f32, 4, float32x4_t); +#ifdef CMT_ARCH_NEON64 +KFR_SIMD_TYPE(f64, 2, float64x2_t); +#endif // CMT_ARCH_NEON64 +#endif // CMT_ARCH_NEON + +#if defined CMT_COMPILER_MSVC +#define KFR_i8sse_INDEX(x, i) x.m128i_i8[i] +#define KFR_i16sse_INDEX(x, i) x.m128i_i16[i] +#define KFR_i32sse_INDEX(x, i) x.m128i_i32[i] +#define KFR_i64sse_INDEX(x, i) x.m128i_i64[i] +#define KFR_u8sse_INDEX(x, i) x.m128i_u8[i] +#define KFR_u16sse_INDEX(x, i) x.m128i_u16[i] +#define KFR_u32sse_INDEX(x, i) x.m128i_u32[i] +#define KFR_u64sse_INDEX(x, i) x.m128i_u64[i] +#define KFR_f32sse_INDEX(x, i) x.m128_f32[i] +#define KFR_f64sse_INDEX(x, i) x.m128d_f64[i] +#else +#define 
KFR_i8sse_INDEX(x, i) bitcast_anything<simd_array<i8, 16>>(x).val[i] +#define KFR_i16sse_INDEX(x, i) bitcast_anything<simd_array<i16, 8>>(x).val[i] +#define KFR_i32sse_INDEX(x, i) _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 1, i))) +#define KFR_i64sse_INDEX(x, i) _mm_cvtsi128_si64(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, (i)*2 + 1, i * 2))) +#define KFR_u8sse_INDEX(x, i) bitcast_anything<simd_array<u8, 16>>(x).val[i] +#define KFR_u16sse_INDEX(x, i) bitcast_anything<simd_array<u16, 8>>(x).val[i] +#define KFR_u32sse_INDEX(x, i) _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 1, i))) +#define KFR_u64sse_INDEX(x, i) _mm_cvtsi128_si64(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, (i)*2 + 1, i * 2))) +#define KFR_f32sse_INDEX(x, i) _mm_cvtss_f32(_mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 2, 1, i))) +#define KFR_f64sse_INDEX(x, i) _mm_cvtsd_f64(_mm_shuffle_pd(x, x, _MM_SHUFFLE2(1, i))) +#endif + +// specializations + +#ifdef KFR_NATIVE_INTRINSICS + +#define KFR_GEN_ty(n, ty) ty(n) +#define KFR_GEN_arg_def(n, ty) ty arg##n +#define KFR_GEN_arg(n, ty) arg##n + +#define KFR_INTRIN_MAKE(n, ty, intrin) \ + KFR_INTRINSIC simd<ty, n> simd_make(ctype_t<ty>, CMT_GEN_LIST(n, KFR_GEN_arg_def, ty)) CMT_NOEXCEPT \ + { \ + return intrin(CMT_GEN_LIST(n, KFR_GEN_arg, ty)); \ + } + +#ifdef CMT_ARCH_SSE2 +inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); } +KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x) +KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x) +KFR_INTRIN_MAKE(2, f64, _mm_setr_pd) +KFR_INTRIN_MAKE(4, i32, _mm_setr_epi32) +KFR_INTRIN_MAKE(4, u32, _mm_setr_epi32) +KFR_INTRIN_MAKE(4, f32, _mm_setr_ps) +KFR_INTRIN_MAKE(8, i16, _mm_setr_epi16) +KFR_INTRIN_MAKE(8, u16, _mm_setr_epi16) +KFR_INTRIN_MAKE(16, i8, _mm_setr_epi8) +KFR_INTRIN_MAKE(16, u8, _mm_setr_epi8) + +#define KFR_INTRIN_BITCAST(Tout, Tin, N, ...) 
\ + KFR_INTRINSIC simd<Tout, N> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } +KFR_INTRIN_BITCAST(f32, i32, 4, _mm_castsi128_ps(x)) +KFR_INTRIN_BITCAST(i32, f32, 4, _mm_castps_si128(x)) +KFR_INTRIN_BITCAST(f64, i64, 2, _mm_castsi128_pd(x)) +KFR_INTRIN_BITCAST(i64, f64, 2, _mm_castpd_si128(x)) + +#define KFR_INTRIN_BROADCAST(T, N, ...) \ + KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, T value) CMT_NOEXCEPT { return __VA_ARGS__; } + +KFR_INTRIN_BROADCAST(i8, 16, _mm_set1_epi8(value)) +KFR_INTRIN_BROADCAST(i16, 8, _mm_set1_epi16(value)) +KFR_INTRIN_BROADCAST(i32, 4, _mm_set1_epi32(value)) +KFR_INTRIN_BROADCAST(i64, 2, _mm_set1_epi64x(value)) +KFR_INTRIN_BROADCAST(u8, 16, _mm_set1_epi8(value)) +KFR_INTRIN_BROADCAST(u16, 8, _mm_set1_epi16(value)) +KFR_INTRIN_BROADCAST(u32, 4, _mm_set1_epi32(value)) +KFR_INTRIN_BROADCAST(u64, 2, _mm_set1_epi64x(value)) +KFR_INTRIN_BROADCAST(f32, 4, _mm_set1_ps(value)) +KFR_INTRIN_BROADCAST(f64, 2, _mm_set1_pd(value)) + +#define KFR_INTRIN_SHUFFLE_SWAP(T, N, ...) \ + KFR_INTRINSIC simd<T, N> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N> ^ csize<1>, \ + overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +#define KFR_INTRIN_SHUFFLE_LINEAR(T, Nout, Nin, ...) \ + KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizeseq_t<Nout>, \ + overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } +#define KFR_INTRIN_SHUFFLE_LINEAR_START(T, Nout, Nin, Nstart, ...) \ + KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, \ + csizeseq_t<Nout, Nstart>, overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +#define KFR_INTRIN_SHUFFLE_CONCAT(T, Nin, ...) 
\ + KFR_INTRINSIC simd<T, Nin + Nin> simd_shuffle(simd2_t<T, Nin, Nin>, const simd<T, Nin>& x, \ + const simd<T, Nin>& y, csizeseq_t<Nin + Nin>, \ + overload_priority<9>) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +// extend +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 1, _mm_cvtsi32_si128(u8(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 1, _mm_cvtsi32_si128(u16(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 1, _mm_cvtsi64_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 1, _mm_cvtsi64_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 1, _mm_set_ss(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 1, _mm_set_sd(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 4, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 4, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 8, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 8, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 4, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 4, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 2, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 2, _mm_cvtsi64_si128(x.whole)) + +// slice +KFR_INTRIN_SHUFFLE_LINEAR(i32, 1, 4, _mm_cvtsi128_si32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 1, 4, _mm_cvtsi128_si32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 
2>>(_mm_cvtsd_f64(_mm_castps_pd(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 4, 16, simd<i8, 4>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 8, 16, simd<i8, 8>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 2, 16, simd<u8, 2>::from(u16(_mm_cvtsi128_si32(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 4, 16, simd<u8, 4>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 8, 16, simd<u8, 8>::from(_mm_cvtsi128_si64(x))) + +KFR_INTRIN_SHUFFLE_LINEAR(i16, 2, 8, simd<i16, 2>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 4, 8, simd<i16, 4>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 2, 8, simd<u16, 2>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 4, 8, simd<u16, 4>::from(_mm_cvtsi128_si64(x))) + +KFR_INTRIN_SHUFFLE_LINEAR(i32, 2, 4, simd<i32, 2>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 2, 4, simd<u32, 2>::from(_mm_cvtsi128_si64(x))) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 8, 16, 8, simd<u8, 8>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 8, 16, 8, simd<i8, 8>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 4, 8, 4, simd<u16, 4>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1))) + +#define KFR_INTRIN_CONVERT(Tout, Tin, N, ...) 
\ + KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + +KFR_INTRIN_CONVERT(f32, i32, 4, _mm_cvtepi32_ps(x)) +KFR_INTRIN_CONVERT(i32, f32, 4, _mm_cvttps_epi32(x)) +KFR_INTRIN_CONVERT(i32, f64, 2, simd<i32, 2>::from(_mm_cvtsi128_si64(_mm_cvttpd_epi32(x)))) +KFR_INTRIN_CONVERT(f64, i32, 2, _mm_cvtepi32_pd(KFR_mm_setr_epi64x(x.whole, 0))) +KFR_INTRIN_CONVERT(i64, f64, 2, _mm_set_epi64x(_mm_cvttsd_si64(_mm_unpackhi_pd(x, x)), _mm_cvttsd_si64(x))) +KFR_INTRIN_CONVERT(f64, i64, 2, + _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), _mm_cvtsi128_si64(x)), + _mm_cvtsi64_sd(_mm_setzero_pd(), KFR_i64sse_INDEX(x, 1)))) +#ifdef CMT_ARCH_AVX +KFR_INTRIN_CONVERT(f64, f32, 4, _mm256_cvtps_pd(x)) +#else +KFR_INTRIN_CONVERT(f64, f32, 4, + simd<f64, 4>{ _mm_cvtps_pd(x), + _mm_cvtps_pd(_mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2))) }) +#endif +#ifdef CMT_ARCH_AVX +KFR_INTRIN_CONVERT(f32, f64, 4, _mm256_cvtpd_ps(x)) +#else +KFR_INTRIN_CONVERT(f32, f64, 4, + simd<f32, 4>{ _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtpd_ps(x.low)), + _mm_castps_pd(_mm_cvtpd_ps(x.high)))) }) +#endif +#endif // CMT_ARCH_SSE2 + +#ifdef CMT_ARCH_SSE41 + +KFR_INTRIN_CONVERT(i16, i8, 8, _mm_cvtepi8_epi16(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u16, u8, 8, _mm_cvtepu8_epi16(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i32, i16, 4, _mm_cvtepi16_epi32(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u32, u16, 4, _mm_cvtepu16_epi32(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(i32, i8, 4, _mm_cvtepi8_epi32(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u32, u8, 4, _mm_cvtepu8_epi32(_mm_cvtsi32_si128(x.whole))) + +KFR_INTRIN_CONVERT(i64, i32, 2, _mm_cvtepi32_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u32, 2, _mm_cvtepu32_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(i64, i16, 2, _mm_cvtepi16_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u16, 2, 
_mm_cvtepu16_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(i64, i8, 2, _mm_cvtepi8_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u8, 2, _mm_cvtepu8_epi64(_mm_cvtsi32_si128(x.whole))) + +KFR_INTRIN_CONVERT(f32, i8, 4, _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_cvtsi32_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, i16, 4, _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_cvtsi64_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, u8, 4, _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, u16, 4, _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_cvtsi64_si128(x.whole)))) + +#ifndef CMT_ARCH_AVX +KFR_INTRIN_CONVERT(i64, i32, 4, + simd<i64, 4>{ _mm_cvtepi32_epi64(x), + _mm_cvtepi32_epi64(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))) }) +#endif +#endif + +#ifdef CMT_ARCH_AVX +KFR_INTRIN_MAKE(4, f64, _mm256_setr_pd) +KFR_INTRIN_MAKE(8, f32, _mm256_setr_ps) + +KFR_INTRIN_BITCAST(f32, i32, 8, _mm256_castsi256_ps(x)) + +KFR_INTRIN_BITCAST(i32, f32, 8, _mm256_castps_si256(x)) +KFR_INTRIN_BITCAST(f64, i64, 4, _mm256_castsi256_pd(x)) +KFR_INTRIN_BITCAST(i64, f64, 4, _mm256_castpd_si256(x)) + +KFR_INTRINSIC __m256 KFR_mm256_setr_m128(__m128 x, __m128 y) +{ + return _mm256_insertf128_ps(_mm256_castps128_ps256(x), y, 1); +} + +KFR_INTRINSIC __m256d KFR_mm256_setr_m128d(__m128d x, __m128d y) +{ + return _mm256_insertf128_pd(_mm256_castpd128_pd256(x), y, 1); +} +KFR_INTRINSIC __m256i KFR_mm256_setr_m128i(__m128i x, __m128i y) +{ +#ifdef CMT_ARCH_AVX2 + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), y, 1); +#else + return _mm256_castps_si256( + _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(x)), _mm_castsi128_ps(y), 1)); +#endif +} + +KFR_INTRIN_SHUFFLE_CONCAT(f32, 4, KFR_mm256_setr_m128(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(f64, 2, KFR_mm256_setr_m128d(x, y)) + +// concat +KFR_INTRIN_SHUFFLE_CONCAT(i8, 16, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i16, 8, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i32, 4, 
KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i64, 2, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u8, 16, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u16, 8, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u32, 4, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u64, 2, KFR_mm256_setr_m128i(x, y)) +// low +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 8, _mm256_castps256_ps128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 4, _mm256_castpd256_pd128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 32, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 16, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 8, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 4, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 32, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 16, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 8, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 4, _mm256_castsi256_si128(x)) + +// extend +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 4, _mm256_castps128_ps256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 2, _mm256_castpd128_pd256(x)) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4, 8, 4, _mm256_extractf128_ps(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2, 4, 2, _mm256_extractf128_pd(x, 1)) + +KFR_INTRIN_BROADCAST(f32, 8, _mm256_set1_ps(value)) +KFR_INTRIN_BROADCAST(f64, 4, _mm256_set1_pd(value)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 8, 1, _mm256_castps128_ps256(_mm_set_ss(x))) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 4, 1, _mm256_castpd128_pd256(_mm_set_sd(x))) +#endif // CMT_ARCH_AVX + +#ifdef CMT_ARCH_AVX2 +KFR_INTRIN_MAKE(4, i64, _mm256_setr_epi64x) +KFR_INTRIN_MAKE(4, u64, _mm256_setr_epi64x) +KFR_INTRIN_MAKE(8, i32, _mm256_setr_epi32) +KFR_INTRIN_MAKE(8, u32, _mm256_setr_epi32) +KFR_INTRIN_MAKE(16, i16, _mm256_setr_epi16) +KFR_INTRIN_MAKE(16, u16, _mm256_setr_epi16) +KFR_INTRIN_MAKE(32, i8, _mm256_setr_epi8) +KFR_INTRIN_MAKE(32, u8, _mm256_setr_epi8) + 
+KFR_INTRIN_CONVERT(i16, i8, 16, _mm256_cvtepi8_epi16(x)) +KFR_INTRIN_CONVERT(u16, u8, 16, _mm256_cvtepu8_epi16(x)) + +KFR_INTRIN_CONVERT(i32, i16, 8, _mm256_cvtepi16_epi32(x)) +KFR_INTRIN_CONVERT(u32, u16, 8, _mm256_cvtepu16_epi32(x)) +KFR_INTRIN_CONVERT(i32, i8, 8, _mm256_cvtepi8_epi32(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u32, u8, 8, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i64, i32, 4, _mm256_cvtepi32_epi64(x)) +KFR_INTRIN_CONVERT(u64, u32, 4, _mm256_cvtepu32_epi64(x)) +KFR_INTRIN_CONVERT(i64, i16, 4, _mm256_cvtepi16_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u16, 4, _mm256_cvtepu16_epi64(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_CONVERT(i64, i8, 4, _mm256_cvtepi8_epi64(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_CONVERT(u64, u8, 4, _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(x.whole))) + +KFR_INTRIN_CONVERT(f32, i8, 8, _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, i16, 8, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(x))) +KFR_INTRIN_CONVERT(f32, u8, 8, _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole)))) +KFR_INTRIN_CONVERT(f32, u16, 8, _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(x))) + +KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 16, 32, 16, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 8, 16, 8, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4, 8, 4, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2, 4, 2, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 16, 32, 16, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 8, 16, 8, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 4, 8, 4, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u64, 2, 4, 2, _mm256_extracti128_si256(x, 1)) + +KFR_INTRIN_BROADCAST(i8, 32, _mm256_set1_epi8(value)) +KFR_INTRIN_BROADCAST(i16, 16, _mm256_set1_epi16(value)) +KFR_INTRIN_BROADCAST(i32, 8, 
_mm256_set1_epi32(value)) +KFR_INTRIN_BROADCAST(i64, 4, _mm256_set1_epi64x(value)) +KFR_INTRIN_BROADCAST(u8, 32, _mm256_set1_epi8(value)) +KFR_INTRIN_BROADCAST(u16, 16, _mm256_set1_epi16(value)) +KFR_INTRIN_BROADCAST(u32, 8, _mm256_set1_epi32(value)) +KFR_INTRIN_BROADCAST(u64, 4, _mm256_set1_epi64x(value)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 16, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 8, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 4, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 2, _mm256_castsi128_si256(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u8(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u16(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 
2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i32, f32, 8, _mm256_cvttps_epi32(x)) +KFR_INTRIN_CONVERT(f32, i32, 8, _mm256_cvtepi32_ps(x)) +KFR_INTRIN_CONVERT(f64, i32, 4, _mm256_cvtepi32_pd(x)) +KFR_INTRIN_CONVERT(i32, f64, 4, _mm256_cvttpd_epi32(x)) +#endif // CMT_ARCH_AVX2 + +#ifdef CMT_ARCH_AVX512 + +static inline __m512d KFR_mm512_setr_pd(f64 x0, f64 x1, f64 x2, f64 x3, f64 x4, f64 x5, f64 x6, f64 x7) +{ + return _mm512_set_pd(x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512 KFR_mm512_setr_ps(f32 x0, f32 x1, f32 x2, f32 x3, f32 x4, f32 x5, f32 x6, f32 x7, f32 x8, + f32 x9, f32 x10, f32 x11, f32 x12, f32 x13, f32 x14, f32 x15) +{ + return _mm512_set_ps(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi64(i64 x0, i64 x1, i64 x2, i64 x3, i64 x4, i64 x5, i64 x6, i64 x7) +{ + return _mm512_set_epi64(x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi32(i32 x0, i32 x1, i32 x2, i32 x3, i32 x4, i32 x5, i32 x6, i32 x7, + i32 x8, i32 x9, i32 x10, i32 x11, i32 x12, i32 x13, i32 x14, + i32 x15) +{ + return _mm512_set_epi32(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi16(i16 x0, i16 x1, i16 x2, i16 x3, i16 x4, i16 x5, i16 x6, i16 x7, + i16 x8, i16 x9, i16 x10, i16 x11, i16 x12, i16 x13, i16 x14, + i16 x15, i16 x16, i16 x17, i16 x18, i16 x19, i16 x20, i16 x21, + i16 x22, i16 x23, i16 x24, i16 x25, i16 x26, i16 x27, i16 x28, + i16 x29, i16 x30, i16 x31) +{ + return _mm512_set_epi16(x31, x30, x29, x28, x27, 
x26, x25, x24, x23, x22, x21, x20, x19, x18, x17, x16, + x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} +static inline __m512i KFR_mm512_setr_epi8(i8 x0, i8 x1, i8 x2, i8 x3, i8 x4, i8 x5, i8 x6, i8 x7, i8 x8, + i8 x9, i8 x10, i8 x11, i8 x12, i8 x13, i8 x14, i8 x15, i8 x16, + i8 x17, i8 x18, i8 x19, i8 x20, i8 x21, i8 x22, i8 x23, i8 x24, + i8 x25, i8 x26, i8 x27, i8 x28, i8 x29, i8 x30, i8 x31, i8 x32, + i8 x33, i8 x34, i8 x35, i8 x36, i8 x37, i8 x38, i8 x39, i8 x40, + i8 x41, i8 x42, i8 x43, i8 x44, i8 x45, i8 x46, i8 x47, i8 x48, + i8 x49, i8 x50, i8 x51, i8 x52, i8 x53, i8 x54, i8 x55, i8 x56, + i8 x57, i8 x58, i8 x59, i8 x60, i8 x61, i8 x62, i8 x63) +{ + return _mm512_set_epi8(x63, x62, x61, x60, x59, x58, x57, x56, x55, x54, x53, x52, x51, x50, x49, x48, + x47, x46, x45, x44, x43, x42, x41, x40, x39, x38, x37, x36, x35, x34, x33, x32, + x31, x30, x29, x28, x27, x26, x25, x24, x23, x22, x21, x20, x19, x18, x17, x16, + x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0); +} + +KFR_INTRINSIC __m512 KFR_mm512_setr_m256(__m256 x, __m256 y) +{ + return _mm512_insertf32x8(_mm512_castps256_ps512(x), y, 1); +} + +KFR_INTRINSIC __m512d KFR_mm512_setr_m256d(__m256d x, __m256d y) +{ + return _mm512_insertf64x4(_mm512_castpd256_pd512(x), y, 1); +} +KFR_INTRINSIC __m512i KFR_mm512_setr_m256i(__m256i x, __m256i y) +{ + return _mm512_inserti32x8(_mm512_castsi256_si512(x), y, 1); +} + +KFR_INTRIN_MAKE(8, f64, KFR_mm512_setr_pd) +KFR_INTRIN_MAKE(16, f32, KFR_mm512_setr_ps) + +KFR_INTRIN_MAKE(8, i64, KFR_mm512_setr_epi64) +KFR_INTRIN_MAKE(8, u64, KFR_mm512_setr_epi64) +KFR_INTRIN_MAKE(16, i32, KFR_mm512_setr_epi32) +KFR_INTRIN_MAKE(16, u32, KFR_mm512_setr_epi32) +KFR_INTRIN_MAKE(32, i16, KFR_mm512_setr_epi16) +KFR_INTRIN_MAKE(32, u16, KFR_mm512_setr_epi16) +KFR_INTRIN_MAKE(64, i8, KFR_mm512_setr_epi8) +KFR_INTRIN_MAKE(64, u8, KFR_mm512_setr_epi8) + +KFR_INTRIN_BROADCAST(f32, 16, _mm512_set1_ps(value)) +KFR_INTRIN_BROADCAST(f64, 8, 
_mm512_set1_pd(value)) + +KFR_INTRIN_BROADCAST(i8, 64, _mm512_set1_epi8(value)) +KFR_INTRIN_BROADCAST(i16, 32, _mm512_set1_epi16(value)) +KFR_INTRIN_BROADCAST(i32, 16, _mm512_set1_epi32(value)) +KFR_INTRIN_BROADCAST(i64, 8, _mm512_set1_epi64(value)) +KFR_INTRIN_BROADCAST(u8, 64, _mm512_set1_epi8(value)) +KFR_INTRIN_BROADCAST(u16, 32, _mm512_set1_epi16(value)) +KFR_INTRIN_BROADCAST(u32, 16, _mm512_set1_epi32(value)) +KFR_INTRIN_BROADCAST(u64, 8, _mm512_set1_epi64(value)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u8(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u16(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 4, 
_mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) + +KFR_INTRIN_CONVERT(i32, f32, 16, _mm512_cvttps_epi32(x)) +KFR_INTRIN_CONVERT(f32, i32, 16, _mm512_cvtepi32_ps(x)) +KFR_INTRIN_CONVERT(f64, i32, 8, _mm512_cvtepi32_pd(x)) +KFR_INTRIN_CONVERT(i32, f64, 8, _mm512_cvttpd_epi32(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 4, _mm512_castps128_ps512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2, _mm512_castpd128_pd512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 16, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 8, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 4, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2, _mm512_castsi128_si512(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 2 * 4, _mm512_castps256_ps512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2 * 2, _mm512_castpd256_pd512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2 * 16, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2 * 8, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2 * 4, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2 * 2, _mm512_castsi256_si512(x)) + +// low +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 8 * 2, _mm512_castps512_ps256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 4 * 2, _mm512_castpd512_pd256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 16 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 16 * 2, 
_mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x)) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4 * 2, 8 * 2, 4 * 2, _mm512_extractf32x8_ps(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2 * 2, 4 * 2, 2 * 2, _mm512_extractf64x4_pd(x, 1)) + +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4 * 2, 8 * 2, 4 * 2, _mm512_extracti32x8_epi32(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2 * 2, 4 * 2, 2 * 2, _mm512_extracti64x4_epi64(x, 1)) + +// concat +KFR_INTRIN_SHUFFLE_CONCAT(f32, 4 * 2, KFR_mm512_setr_m256(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(f64, 2 * 2, KFR_mm512_setr_m256d(x, y)) + +KFR_INTRIN_SHUFFLE_CONCAT(i8, 16 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i16, 8 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i32, 4 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i64, 2 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u8, 16 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u16, 8 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u32, 4 * 2, KFR_mm512_setr_m256i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u64, 2 * 2, KFR_mm512_setr_m256i(x, y)) +#endif + +#endif + +// generic functions + +template <typename T, size_t N1> +KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT; + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... 
z) CMT_NOEXCEPT; + +template <typename T, size_t N> +KFR_INTRINSIC simd_array<T, N> to_simd_array(const simd<T, N>& x) CMT_NOEXCEPT +{ + return bitcast_anything<simd_array<T, N>>(x); +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT +{ + return bitcast_anything<simd<T, N>>(x); +} + +#define KFR_COMPONENTWISE_RET(code) \ + vec<T, N> result; \ + for (size_t i = 0; i < N; i++) \ + code; \ + return result; + +#define KFR_COMPONENTWISE_RET_I(Tvec, code) \ + Tvec result; \ + for (size_t i = 0; i < result.size(); i++) \ + code; \ + return result; + +#define KFR_COMPONENTWISE(code) \ + for (size_t i = 0; i < N; i++) \ + code; + +template <typename Tout> +KFR_INTRINSIC void simd_make(ctype_t<Tout>) CMT_NOEXCEPT = delete; + +template <typename Tout, typename Arg> +KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) CMT_NOEXCEPT +{ + return simd<Tout, 1>{ static_cast<Tout>(arg) }; +} + +template <typename T, size_t... indices, typename... Args, size_t N = sizeof...(indices)> +KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... args) CMT_NOEXCEPT; + +template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)> +KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) CMT_NOEXCEPT +{ + constexpr size_t Nlow = prev_poweroftwo(N - 1); + return simd_concat<Tout, Nlow, N - Nlow>(simd_make_helper<Tout>(csizeseq<Nlow>, args...), + simd_make_helper<Tout>(csizeseq<N - Nlow, Nlow>, args...)); +} + +template <typename T, size_t... indices, typename... Args, size_t N> +KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... args) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + const T temp[] = { static_cast<T>(args)... 
}; + return simd_make(ctype<T>, temp[indices]...); +} + +/// @brief Returns vector with undefined value +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_undefined() CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd<Tout, N> x; + return x; +} + +/// @brief Returns vector with all zeros +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_zeros() CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return from_simd_array<Tout, N>({ Tout() }); +} + +/// @brief Returns vector with all ones +template <typename Tout, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_allones() CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<Tout, N> x{}; + KFR_COMPONENTWISE(x.val[i] = special_constants<Tout>::allones()); + return from_simd_array(x); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout)), + KFR_ENABLE_IF(Nout == 1 || N == 1)> +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return bitcast_anything<simd<Tout, Nout>>(x); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout)), + KFR_ENABLE_IF(Nout > 1 && N > 1)> +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +{ + constexpr size_t Nlow = prev_poweroftwo(N - 1); + return simd_concat<Tout, Nlow * Nout / N, (N - Nlow) * Nout / N>( + simd_bitcast(simd_cvt_t<Tout, Tin, Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)), + simd_bitcast(simd_cvt_t<Tout, Tin, N - Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto))); +} + +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) 
CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return to_simd_array<T, N>(value).val[index]; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<T, N> arr = to_simd_array<T, N>(value); + arr.val[index] = x; + return from_simd_array(arr); +} + +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N>, + overload_priority<10>) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC const simd<T, N1>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>&, + csizeseq_t<N1>, overload_priority<9>) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC const simd<T, N2>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>&, const simd<T, N2>& y, + csizeseq_t<N2, N1>, overload_priority<9>) CMT_NOEXCEPT +{ + return y; +} + +// concat() +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N + N>, simd_halves<T, N + N>>::value)> +KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizeseq_t<N + N>, overload_priority<8>) CMT_NOEXCEPT +{ + return simd<T, N + N>{ x, y }; +} + +template <typename T> +KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT +{ + return { value }; +} + +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)> +KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) CMT_NOEXCEPT +{ + return simd_concat<T, Nlow, N - Nlow>(simd_broadcast(simd_t<T, Nlow>{}, value), + simd_broadcast(simd_t<T, N - Nlow>{}, value)); +} + 
+template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N>, simd_halves<T, N>>::value)> +KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2>, + overload_priority<7>) CMT_NOEXCEPT +{ + return x.low; +} + +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N>, simd_halves<T, N>>::value)> +KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2, N / 2>, + overload_priority<7>) CMT_NOEXCEPT +{ + return x.high; +} + +template <typename T, size_t Nout, size_t N> +simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsigned (&indices)[Nout]) +{ + simd_array<T, Nout> result; + for (size_t i = 0; i < Nout; ++i) + { + const size_t index = indices[i]; + result.val[i] = index >= N ? T() : x.val[index]; + } + return result; +} + +template <typename T, size_t Nout, size_t N1, size_t N2> +simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd_array<T, N2>& y, + const unsigned (&indices)[Nout]) +{ + simd_array<T, Nout> result; + for (size_t i = 0; i < Nout; ++i) + { + const size_t index = indices[i]; + result.val[i] = index > N1 + N2 ? T() : index >= N1 ? y.val[index - N1] : x.val[index]; + } + return result; +} + +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>, + overload_generic) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); +#ifdef CMT_COMPILER_MSVC + const simd_array<T, N> xx = to_simd_array<T, N>(x); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array)); +#else + return from_simd_array<T, Nout>({ (indices > N ? T() : to_simd_array<T, N>(x).val[indices])... }); +#endif +} + +template <typename T, size_t N, size_t N2 = N, size_t... 
indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizes_t<indices...>, overload_generic) CMT_NOEXCEPT +{ + static_assert(N == N2, ""); + not_optimized(CMT_FUNC_SIGNATURE); +#ifdef CMT_COMPILER_MSVC + const simd_array<T, N> xx = to_simd_array<T, N>(x); + const simd_array<T, N> yy = to_simd_array<T, N>(y); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array)); +#else + return from_simd_array<T, Nout>( + { (indices > N * 2 ? T() + : indices >= N ? to_simd_array<T, N>(y).val[indices - N] + : to_simd_array<T, N>(x).val[indices])... }); +#endif +} + +template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y, + csizes_t<indices...>, overload_generic) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + +#ifdef CMT_COMPILER_MSVC + const simd_array<T, N1> xx = to_simd_array<T, N1>(x); + const simd_array<T, N2> yy = to_simd_array<T, N2>(y); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N1, N2>(xx, yy, indices_array)); +#else + + return from_simd_array<T, Nout>( + { (indices > N1 + N2 ? T() + : indices >= N1 ? to_simd_array<T, N2>(y).val[indices - N1] + : to_simd_array<T, N1>(x).val[indices])... }); +#endif +} + +template <typename T, size_t N1> +KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/> +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... 
z) CMT_NOEXCEPT +{ + return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...), + csizeseq<N1 + N2 + Nscount>, overload_auto); +} + +template <typename Tout, typename Tin, size_t N, size_t... indices> +KFR_INTRINSIC simd<Tout, N> simd_convert__(const simd<Tin, N>& x, csizes_t<indices...>) CMT_NOEXCEPT +{ + const simd_array<Tin, N> xx = to_simd_array<Tin, N>(x); + return simd_make(ctype<Tout>, static_cast<Tout>(xx.val[indices])...); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin> +KFR_INTRINSIC simd<Tout, 1> simd_convert(simd_cvt_t<Tout, Tin, 1>, const simd<Tin, 1>& x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return simd_make(ctype<Tout>, static_cast<Tout>(x)); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename Tout, typename Tin, size_t N> +KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +{ + constexpr size_t Nlow = prev_poweroftwo(N - 1); + return simd_concat<Tout, Nlow, N - Nlow>( + simd_convert(simd_cvt_t<Tout, Tin, Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)), + simd_convert(simd_cvt_t<Tout, Tin, N - Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto))); +} + +/// @brief Converts input vector to vector with subtype Tout +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT +{ + return x; +} + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-attributes") + +template <typename T, size_t N, bool A> +using simd_storage = struct_with_alignment<simd<T, N>, A>; + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT +{ + return reinterpret_cast<typename simd_storage<T, 
N, A>::const_pointer>(src)->value; +} + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + constexpr auto extend_indices = + cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); + constexpr auto concat_indices = cvalseq_t<size_t, N>(); + return simd_shuffle( + simd2_t<T, first, first>{}, simd_read<first, A>(src), + simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto), + concat_indices, overload_auto); +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT +{ + reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = value; +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<first>(), overload_auto)); + simd_write<false, rest>(dest + first, + simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<rest, first>(), overload_auto)); +} + +template <typename T, size_t N> +KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + return to_simd_array<T, N>(value).val[index]; +} + +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, T x) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<T, N> arr = to_simd_array<T, N>(value); + arr.val[index] = x; + return from_simd_array(arr); +} +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr + 
+CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/simd/impl/basicoperators_clang.hpp b/include/kfr/simd/impl/basicoperators_clang.hpp @@ -0,0 +1,178 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include "../mask.hpp" +#include "function.hpp" +#include <algorithm> +#include <utility> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) +{ + return -x.v; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x) +{ + return simd_bitcast(simd_cvt_t<T, utype<T>, N>{}, ~simd_bitcast(simd_cvt_t<utype<T>, T, N>{}, x.v)); +} + +#define KFR_OP_SCALAR2(fn, op, resultprefix, operprefix, soperprefix) \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& x, const T& y) \ + { \ + return resultprefix(operprefix(x.v) op soperprefix(y)); \ + } \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const T& x, const vec<T, N>& y) \ + { \ + return resultprefix(soperprefix(x) op operprefix(y.v)); \ + } + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> add(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v + y.v; +} +KFR_OP_SCALAR2(add, +, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> sub(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v - y.v; +} +KFR_OP_SCALAR2(sub, -, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> mul(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v * y.v; +} +KFR_OP_SCALAR2(mul, *, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> div(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.v / y.v; +} +KFR_OP_SCALAR2(div, /, , , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> band(const vec<T, N>& x, const vec<T, 
N>& y) +{ + return (simd<T, N>)((simd<utype<T>, N>)(x.v) & (simd<utype<T>, N>)(y.v)); +} +KFR_OP_SCALAR2(band, &, (simd<T, N>), (simd<utype<T>, N>), ubitcast) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> bor(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)((simd<utype<T>, N>)(x.v) | (simd<utype<T>, N>)(y.v)); +} +KFR_OP_SCALAR2(bor, |, (simd<T, N>), (simd<utype<T>, N>), ubitcast) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> bxor(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)((simd<utype<T>, N>)(x.v) ^ (simd<utype<T>, N>)(y.v)); +} +KFR_OP_SCALAR2(bxor, ^, (simd<T, N>), (simd<utype<T>, N>), ubitcast) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) << y.v); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) >> y.v); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, unsigned y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) << y); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y) +{ + return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) >> y); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v == y.v); +} +KFR_OP_SCALAR2(eq, ==, (simd<T, N>), , ) 
+ +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v != y.v); +} +KFR_OP_SCALAR2(ne, !=, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v <= y.v); +} +KFR_OP_SCALAR2(le, <=, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v >= y.v); +} +KFR_OP_SCALAR2(ge, >=, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v < y.v); +} +KFR_OP_SCALAR2(lt, <, (simd<T, N>), , ) + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y) +{ + return (simd<T, N>)(x.v > y.v); +} +KFR_OP_SCALAR2(gt, >, (simd<T, N>), , ) +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/basicoperators_generic.hpp b/include/kfr/simd/impl/basicoperators_generic.hpp @@ -0,0 +1,1674 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../mask.hpp" +#include "function.hpp" +#include <algorithm> +#include <utility> + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4700)) +CMT_PRAGMA_MSVC(warning(disable : 4309)) + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC __m128 _mm_allones_ps() +{ + return _mm_castsi128_ps(_mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128())); +} + +KFR_INTRINSIC __m128d _mm_allones_pd() +{ + return _mm_castsi128_pd(_mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128())); +} + +KFR_INTRINSIC __m128i _mm_allones_si128() { return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); } + +KFR_INTRINSIC __m128 _mm_not_ps(const __m128& x) { return _mm_xor_ps(x, _mm_allones_ps()); } + +KFR_INTRINSIC __m128d _mm_not_pd(const __m128d& x) { return _mm_xor_pd(x, _mm_allones_pd()); } + +KFR_INTRINSIC __m128i _mm_not_si128(const __m128i& x) { return _mm_xor_si128(x, _mm_allones_si128()); } + +KFR_INTRINSIC __m128i _mm_highbit_epi8() { return _mm_set1_epi8(static_cast<char>(0x80)); } +KFR_INTRINSIC __m128i _mm_highbit_epi16() { return _mm_set1_epi16(static_cast<short>(0x8000)); } +KFR_INTRINSIC __m128i _mm_highbit_epi32() { return _mm_set1_epi32(static_cast<int>(0x80000000)); } +KFR_INTRINSIC __m128i _mm_highbit_epi64() { return _mm_set1_epi64x(0x8000000000000000ll); } + +KFR_INTRINSIC f32sse add(const f32sse& x, const f32sse& y) { return f32sse(_mm_add_ps(x.v, y.v)); } +KFR_INTRINSIC f32sse sub(const f32sse& x, const f32sse& y) { return f32sse(_mm_sub_ps(x.v, y.v)); } +KFR_INTRINSIC f32sse mul(const f32sse& x, const f32sse& y) { 
return f32sse(_mm_mul_ps(x.v, y.v)); } +KFR_INTRINSIC f32sse div(const f32sse& x, const f32sse& y) { return f32sse(_mm_div_ps(x.v, y.v)); } + +KFR_INTRINSIC f64sse add(const f64sse& x, const f64sse& y) { return f64sse(_mm_add_pd(x.v, y.v)); } +KFR_INTRINSIC f64sse sub(const f64sse& x, const f64sse& y) { return f64sse(_mm_sub_pd(x.v, y.v)); } +KFR_INTRINSIC f64sse mul(const f64sse& x, const f64sse& y) { return f64sse(_mm_mul_pd(x.v, y.v)); } +KFR_INTRINSIC f64sse div(const f64sse& x, const f64sse& y) { return f64sse(_mm_div_pd(x.v, y.v)); } + +KFR_INTRINSIC u8sse add(const u8sse& x, const u8sse& y) { return _mm_add_epi8(x.v, y.v); } +KFR_INTRINSIC u8sse sub(const u8sse& x, const u8sse& y) { return _mm_sub_epi8(x.v, y.v); } +KFR_INTRINSIC u8sse div(const u8sse& x, const u8sse& y) +{ + KFR_COMPONENTWISE_RET_I(u8sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC i8sse add(const i8sse& x, const i8sse& y) { return _mm_add_epi8(x.v, y.v); } +KFR_INTRINSIC i8sse sub(const i8sse& x, const i8sse& y) { return _mm_sub_epi8(x.v, y.v); } +KFR_INTRINSIC i8sse div(const i8sse& x, const i8sse& y) +{ + KFR_COMPONENTWISE_RET_I(i8sse, result[i] = y[i] ? 
x[i] / y[i] : 0); +} + +KFR_INTRINSIC __m128i mul_epi8(const __m128i& x, const __m128i& y) +{ + const __m128i even = _mm_mullo_epi16(x, y); + const __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(x, 8), _mm_srli_epi16(y, 8)); + return _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_srli_epi16(_mm_slli_epi16(even, 8), 8)); +} + +KFR_INTRINSIC u8sse mul(const u8sse& x, const u8sse& y) { return mul_epi8(x.v, y.v); } + +KFR_INTRINSIC i8sse mul(const i8sse& x, const i8sse& y) { return mul_epi8(x.v, y.v); } + +KFR_INTRINSIC u16sse add(const u16sse& x, const u16sse& y) { return _mm_add_epi16(x.v, y.v); } +KFR_INTRINSIC u16sse sub(const u16sse& x, const u16sse& y) { return _mm_sub_epi16(x.v, y.v); } +KFR_INTRINSIC u16sse mul(const u16sse& x, const u16sse& y) { return _mm_mullo_epi16(x.v, y.v); } +KFR_INTRINSIC u16sse div(const u16sse& x, const u16sse& y) +{ + KFR_COMPONENTWISE_RET_I(u16sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC i16sse add(const i16sse& x, const i16sse& y) { return _mm_add_epi16(x.v, y.v); } +KFR_INTRINSIC i16sse sub(const i16sse& x, const i16sse& y) { return _mm_sub_epi16(x.v, y.v); } +KFR_INTRINSIC i16sse mul(const i16sse& x, const i16sse& y) { return _mm_mullo_epi16(x.v, y.v); } +KFR_INTRINSIC i16sse div(const i16sse& x, const i16sse& y) +{ + KFR_COMPONENTWISE_RET_I(i16sse, result[i] = y[i] ? 
x[i] / y[i] : 0); +} + +KFR_INTRINSIC u32sse add(const u32sse& x, const u32sse& y) { return _mm_add_epi32(x.v, y.v); } +KFR_INTRINSIC u32sse sub(const u32sse& x, const u32sse& y) { return _mm_sub_epi32(x.v, y.v); } + +KFR_INTRINSIC i32sse add(const i32sse& x, const i32sse& y) { return _mm_add_epi32(x.v, y.v); } +KFR_INTRINSIC i32sse sub(const i32sse& x, const i32sse& y) { return _mm_sub_epi32(x.v, y.v); } + +#if defined CMT_ARCH_SSE41 +KFR_INTRINSIC u32sse mul(const u32sse& x, const u32sse& y) { return _mm_mullo_epi32(x.v, y.v); } +KFR_INTRINSIC i32sse mul(const i32sse& x, const i32sse& y) { return _mm_mullo_epi32(x.v, y.v); } +#else +KFR_INTRINSIC u32sse mul(const u32sse& x, const u32sse& y) +{ + __m128i tmp1 = _mm_mul_epu32(x.v, y.v); + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x.v, 4), _mm_srli_si128(y.v, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), + _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); +} +KFR_INTRINSIC i32sse mul(const i32sse& x, const i32sse& y) +{ + __m128i tmp1 = _mm_mul_epu32(x.v, y.v); + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x.v, 4), _mm_srli_si128(y.v, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), + _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); +} +#endif +KFR_INTRINSIC u32sse div(const u32sse& x, const u32sse& y) +{ + KFR_COMPONENTWISE_RET_I(u32sse, result[i] = y[i] ? x[i] / y[i] : 0); +} +KFR_INTRINSIC i32sse div(const i32sse& x, const i32sse& y) +{ + KFR_COMPONENTWISE_RET_I(i32sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC u64sse add(const u64sse& x, const u64sse& y) { return _mm_add_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse sub(const u64sse& x, const u64sse& y) { return _mm_sub_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse mul(const u64sse& x, const u64sse& y) +{ + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = x[i] * y[i]); +} +KFR_INTRINSIC u64sse div(const u64sse& x, const u64sse& y) +{ + KFR_COMPONENTWISE_RET_I(u64sse, result[i] = y[i] ? 
x[i] / y[i] : 0); +} + +KFR_INTRINSIC i64sse add(const i64sse& x, const i64sse& y) { return _mm_add_epi64(x.v, y.v); } +KFR_INTRINSIC i64sse sub(const i64sse& x, const i64sse& y) { return _mm_sub_epi64(x.v, y.v); } +KFR_INTRINSIC i64sse mul(const i64sse& x, const i64sse& y) +{ + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] * y[i]); +} +KFR_INTRINSIC i64sse div(const i64sse& x, const i64sse& y) +{ + KFR_COMPONENTWISE_RET_I(i64sse, result[i] = y[i] ? x[i] / y[i] : 0); +} + +KFR_INTRINSIC f32sse shl(const f32sse& x, unsigned y) +{ + return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(x.v), y)); +} +KFR_INTRINSIC f64sse shl(const f64sse& x, unsigned y) +{ + return _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(x.v), y)); +} +KFR_INTRINSIC f32sse shr(const f32sse& x, unsigned y) +{ + return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(x.v), y)); +} +KFR_INTRINSIC f64sse shr(const f64sse& x, unsigned y) +{ + return _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(x.v), y)); +} + +KFR_INTRINSIC u16sse shl(const u16sse& x, unsigned y) { return _mm_slli_epi16(x.v, y); } +KFR_INTRINSIC u32sse shl(const u32sse& x, unsigned y) { return _mm_slli_epi32(x.v, y); } +KFR_INTRINSIC u64sse shl(const u64sse& x, unsigned y) { return _mm_slli_epi64(x.v, y); } +KFR_INTRINSIC i16sse shl(const i16sse& x, unsigned y) { return _mm_slli_epi16(x.v, y); } +KFR_INTRINSIC i32sse shl(const i32sse& x, unsigned y) { return _mm_slli_epi32(x.v, y); } +KFR_INTRINSIC i64sse shl(const i64sse& x, unsigned y) { return _mm_slli_epi64(x.v, y); } + +KFR_INTRINSIC u16sse shr(const u16sse& x, unsigned y) { return _mm_srli_epi16(x.v, y); } +KFR_INTRINSIC u32sse shr(const u32sse& x, unsigned y) { return _mm_srli_epi32(x.v, y); } +KFR_INTRINSIC u64sse shr(const u64sse& x, unsigned y) { return _mm_srli_epi64(x.v, y); } +KFR_INTRINSIC i16sse shr(const i16sse& x, unsigned y) { return _mm_srai_epi16(x.v, y); } +KFR_INTRINSIC i32sse shr(const i32sse& x, unsigned y) { return _mm_srai_epi32(x.v, y); } + 
// ---- SSE2: 8-bit shifts ----
// There is no epi8 shift instruction. Each byte is widened into the HIGH half
// of a 16-bit lane (_mm_unpack*_epi8 with zero as the first operand), shifted
// as 16-bit, then narrowed back with a saturating pack.
// NOTE(review): after the 16-bit shift the payload still sits in the high byte
// of each lane, while _mm_packs_epi16 keeps the (saturated) low byte — confirm
// this matches the scalar per-byte shift for all y; verify against the scalar
// fallback path.
KFR_INTRINSIC u8sse shl(const u8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_slli_epi16(l, y);
    __m128i hh = _mm_slli_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}
KFR_INTRINSIC i8sse shl(const i8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_slli_epi16(l, y);
    __m128i hh = _mm_slli_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}
KFR_INTRINSIC u8sse shr(const u8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_srli_epi16(l, y);
    __m128i hh = _mm_srli_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}
// Signed 8-bit right shift: arithmetic (srai) instead of logical (srli).
KFR_INTRINSIC i8sse shr(const i8sse& x, unsigned y)
{
    __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
    __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);

    __m128i ll = _mm_srai_epi16(l, y);
    __m128i hh = _mm_srai_epi16(h, y);

    return _mm_packs_epi16(ll, hh);
}

// SSE2 has no arithmetic 64-bit right shift; done element-wise in scalar code.
// x[i] is the signed element, so >> here is an arithmetic shift; the bits are
// passed back through a u64sse temporary.
KFR_INTRINSIC i64sse shr(const i64sse& x, unsigned y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = x[i] >> y);
}

// Generic fallback: per-element shift by a vector of counts. Shifts the
// unsigned bit pattern (uibitcast) and casts back; enabled for any SIMD-sized
// type that has an unsigned-integer bit representation.
template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_size<T>(N))>
KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
{
    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y[i])));
}
template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_size<T>(N))>
KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
{
    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y[i])));
}

// ---- SSE2: bitwise AND / OR / XOR (float variants operate on the bit pattern) ----
KFR_INTRINSIC f32sse band(const f32sse& x, const f32sse& y) { return _mm_and_ps(x.v, y.v); }
KFR_INTRINSIC f64sse band(const f64sse& x, const f64sse& y) { return _mm_and_pd(x.v, y.v); }

KFR_INTRINSIC u8sse band(const u8sse& x, const u8sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC u16sse band(const u16sse& x, const u16sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC u32sse band(const u32sse& x, const u32sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC u64sse band(const u64sse& x, const u64sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i8sse band(const i8sse& x, const i8sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i16sse band(const i16sse& x, const i16sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i32sse band(const i32sse& x, const i32sse& y) { return _mm_and_si128(x.v, y.v); }
KFR_INTRINSIC i64sse band(const i64sse& x, const i64sse& y) { return _mm_and_si128(x.v, y.v); }

KFR_INTRINSIC f32sse bor(const f32sse& x, const f32sse& y) { return _mm_or_ps(x.v, y.v); }
KFR_INTRINSIC f64sse bor(const f64sse& x, const f64sse& y) { return _mm_or_pd(x.v, y.v); }

KFR_INTRINSIC u8sse bor(const u8sse& x, const u8sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC u16sse bor(const u16sse& x, const u16sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC u32sse bor(const u32sse& x, const u32sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC u64sse bor(const u64sse& x, const u64sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i8sse bor(const i8sse& x, const i8sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i16sse bor(const i16sse& x, const i16sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i32sse bor(const i32sse& x, const i32sse& y) { return _mm_or_si128(x.v, y.v); }
KFR_INTRINSIC i64sse bor(const i64sse& x, const i64sse& y) { return _mm_or_si128(x.v, y.v); }

KFR_INTRINSIC f32sse bxor(const f32sse& x, const f32sse& y) { return _mm_xor_ps(x.v, y.v); }
KFR_INTRINSIC f64sse bxor(const f64sse& x, const f64sse& y) { return _mm_xor_pd(x.v, y.v); }

KFR_INTRINSIC u8sse bxor(const u8sse& x, const u8sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC u16sse bxor(const u16sse& x, const u16sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC u32sse bxor(const u32sse& x, const u32sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC u64sse bxor(const u64sse& x, const u64sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i8sse bxor(const i8sse& x, const i8sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i16sse bxor(const i16sse& x, const i16sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i32sse bxor(const i32sse& x, const i32sse& y) { return _mm_xor_si128(x.v, y.v); }
KFR_INTRINSIC i64sse bxor(const i64sse& x, const i64sse& y) { return _mm_xor_si128(x.v, y.v); }

// ---- SSE2: comparisons (each returns an all-ones / all-zero per-lane mask) ----
// 64-bit integer eq/ne are provided further below (SSE4.1 path or scalar fallback).
KFR_INTRINSIC f32sse eq(const f32sse& x, const f32sse& y) { return _mm_cmpeq_ps(x.v, y.v); }
KFR_INTRINSIC f64sse eq(const f64sse& x, const f64sse& y) { return _mm_cmpeq_pd(x.v, y.v); }
KFR_INTRINSIC u8sse eq(const u8sse& x, const u8sse& y) { return _mm_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC u16sse eq(const u16sse& x, const u16sse& y) { return _mm_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC u32sse eq(const u32sse& x, const u32sse& y) { return _mm_cmpeq_epi32(x.v, y.v); }
KFR_INTRINSIC i8sse eq(const i8sse& x, const i8sse& y) { return _mm_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC i16sse eq(const i16sse& x, const i16sse& y) { return _mm_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC i32sse eq(const i32sse& x, const i32sse& y) { return _mm_cmpeq_epi32(x.v, y.v); }

// ne = bitwise complement of eq. For floats this means NaN operands compare
// as "not equal" (cmpeq is false for NaN, complement gives true).
KFR_INTRINSIC f32sse ne(const f32sse& x, const f32sse& y) { return _mm_not_ps(_mm_cmpeq_ps(x.v, y.v)); }
KFR_INTRINSIC f64sse ne(const f64sse& x, const f64sse& y) { return _mm_not_pd(_mm_cmpeq_pd(x.v, y.v)); }
KFR_INTRINSIC u8sse ne(const u8sse& x, const u8sse& y) { return _mm_not_si128(_mm_cmpeq_epi8(x.v, y.v)); }
KFR_INTRINSIC u16sse ne(const u16sse& x, const u16sse& y) { return _mm_not_si128(_mm_cmpeq_epi16(x.v, y.v)); }
KFR_INTRINSIC u32sse ne(const u32sse& x, const u32sse& y) { return _mm_not_si128(_mm_cmpeq_epi32(x.v, y.v)); }
KFR_INTRINSIC i8sse ne(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmpeq_epi8(x.v, y.v)); }
KFR_INTRINSIC i16sse ne(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmpeq_epi16(x.v, y.v)); }
KFR_INTRINSIC i32sse ne(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmpeq_epi32(x.v, y.v)); }

KFR_INTRINSIC f32sse lt(const f32sse& x, const f32sse& y) { return _mm_cmplt_ps(x.v, y.v); }
KFR_INTRINSIC f64sse lt(const f64sse& x, const f64sse& y) { return _mm_cmplt_pd(x.v, y.v); }
KFR_INTRINSIC i8sse lt(const i8sse& x, const i8sse& y) { return _mm_cmplt_epi8(x.v, y.v); }
KFR_INTRINSIC i16sse lt(const i16sse& x, const i16sse& y) { return _mm_cmplt_epi16(x.v, y.v); }
KFR_INTRINSIC i32sse lt(const i32sse& x, const i32sse& y) { return _mm_cmplt_epi32(x.v, y.v); }

// Unsigned compares: SSE only has signed integer compares, so both operands
// are biased by the sign bit (adding 0x80.. flips it), after which a signed
// compare orders the values as unsigned.
KFR_INTRINSIC u8sse lt(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_cmplt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb));
}

KFR_INTRINSIC u16sse lt(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_cmplt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb));
}
KFR_INTRINSIC u32sse lt(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_cmplt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb));
}

KFR_INTRINSIC f32sse gt(const f32sse& x, const f32sse& y) { return _mm_cmpgt_ps(x.v, y.v); }
KFR_INTRINSIC f64sse gt(const f64sse& x, const f64sse& y) { return _mm_cmpgt_pd(x.v, y.v); }
KFR_INTRINSIC i8sse gt(const i8sse& x, const i8sse& y) { return _mm_cmpgt_epi8(x.v, y.v); }
KFR_INTRINSIC i16sse gt(const i16sse& x, const i16sse& y) { return _mm_cmpgt_epi16(x.v, y.v); }
KFR_INTRINSIC i32sse gt(const i32sse& x, const i32sse& y) { return _mm_cmpgt_epi32(x.v, y.v); }

KFR_INTRINSIC u8sse gt(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_cmpgt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb));
}

KFR_INTRINSIC u16sse gt(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_cmpgt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb));
}
KFR_INTRINSIC u32sse gt(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_cmpgt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb));
}

// le / ge for integers: complement of the strict compare (le == not gt,
// ge == not lt).
KFR_INTRINSIC f32sse le(const f32sse& x, const f32sse& y) { return _mm_cmple_ps(x.v, y.v); }
KFR_INTRINSIC f64sse le(const f64sse& x, const f64sse& y) { return _mm_cmple_pd(x.v, y.v); }
KFR_INTRINSIC i8sse le(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmpgt_epi8(x.v, y.v)); }
KFR_INTRINSIC i16sse le(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmpgt_epi16(x.v, y.v)); }
KFR_INTRINSIC i32sse le(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmpgt_epi32(x.v, y.v)); }

KFR_INTRINSIC u8sse le(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_not_si128(_mm_cmpgt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb)));
}

KFR_INTRINSIC u16sse le(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_not_si128(_mm_cmpgt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb)));
}
KFR_INTRINSIC u32sse le(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_not_si128(_mm_cmpgt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb)));
}

KFR_INTRINSIC f32sse ge(const f32sse& x, const f32sse& y) { return _mm_cmpge_ps(x.v, y.v); }
KFR_INTRINSIC f64sse ge(const f64sse& x, const f64sse& y) { return _mm_cmpge_pd(x.v, y.v); }
KFR_INTRINSIC i8sse ge(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmplt_epi8(x.v, y.v)); }
KFR_INTRINSIC i16sse ge(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmplt_epi16(x.v, y.v)); }
KFR_INTRINSIC i32sse ge(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmplt_epi32(x.v, y.v)); }

KFR_INTRINSIC u8sse ge(const u8sse& x, const u8sse& y)
{
    const __m128i hb = _mm_highbit_epi8();
    return _mm_not_si128(_mm_cmplt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb)));
}

KFR_INTRINSIC u16sse ge(const u16sse& x, const u16sse& y)
{
    const __m128i hb = _mm_highbit_epi16();
    return _mm_not_si128(_mm_cmplt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb)));
}
KFR_INTRINSIC u32sse ge(const u32sse& x, const u32sse& y)
{
    const __m128i hb = _mm_highbit_epi32();
    return _mm_not_si128(_mm_cmplt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb)));
}

// 64-bit integer eq/ne: hardware compare when SSE4.1 intrinsics are usable,
// otherwise element-wise scalar fallback producing the same all-ones masks.
#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y) { return _mm_cmpeq_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y) { return _mm_cmpeq_epi64(x.v, y.v); }
KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y) { return _mm_not_si128(_mm_cmpeq_epi64(x.v, y.v)); }
KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpeq_epi64(x.v, y.v)); }
#else
KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] == y[i]));
}
KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] == y[i]));
}
KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] != y[i]));
}
KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] != y[i]));
}
#endif

// 64-bit integer ordering: _mm_cmpgt_epi64 requires SSE4.2; all four
// relations are derived from it (lt swaps operands, ge/le complement).
#if defined CMT_ARCH_SSE42
KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y) { return _mm_cmpgt_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y) { return _mm_cmpgt_epi64(y.v, x.v); }
KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpgt_epi64(y.v, x.v)); }
KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpgt_epi64(x.v, y.v)); }

KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_cmpgt_epi64(_mm_add_epi64(x.v, hb), _mm_add_epi64(y.v, hb));
}
KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_cmpgt_epi64(_mm_add_epi64(y.v, hb), _mm_add_epi64(x.v, hb));
}
KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_not_si128(_mm_cmpgt_epi64(_mm_add_epi64(y.v, hb), _mm_add_epi64(x.v, hb)));
}
KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
{
    const __m128i hb = _mm_highbit_epi64();
    return _mm_not_si128(_mm_cmpgt_epi64(_mm_add_epi64(x.v, hb), _mm_add_epi64(y.v, hb)));
}

#else
KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] > y[i]));
}
KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] > y[i]));
}
KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] < y[i]));
}
KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] < y[i]));
}
KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] >= y[i]));
}
KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] >= y[i]));
}
KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] <= y[i]));
}
KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] <= y[i]));
}
#endif

#if defined CMT_ARCH_AVX

// ---- AVX: 256-bit float arithmetic ----
KFR_INTRINSIC f32avx add(const f32avx& x, const f32avx& y) { return f32avx(_mm256_add_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx add(const f64avx& x, const f64avx& y) { return f64avx(_mm256_add_pd(x.v, y.v)); }
KFR_INTRINSIC f32avx sub(const f32avx& x, const f32avx& y) { return f32avx(_mm256_sub_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx sub(const f64avx& x, const f64avx& y) { return f64avx(_mm256_sub_pd(x.v, y.v)); }
KFR_INTRINSIC f32avx mul(const f32avx& x, const f32avx& y) { return f32avx(_mm256_mul_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx mul(const f64avx& x, const f64avx& y) { return f64avx(_mm256_mul_pd(x.v, y.v)); }
KFR_INTRINSIC f32avx div(const f32avx& x, const f32avx& y) { return f32avx(_mm256_div_ps(x.v, y.v)); }
KFR_INTRINSIC f64avx div(const f64avx& x, const f64avx& y) { return f64avx(_mm256_div_pd(x.v, y.v)); }

// All-ones constant: 0 == 0 with the unordered EQ predicate yields a
// full mask without loading from memory.
KFR_INTRINSIC __m256 _mm256_allones_ps()
{
    return _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ);
}

KFR_INTRINSIC __m256d _mm256_allones_pd()
{
    return _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_UQ);
}

// 256-bit integer all-ones: cmpeq_epi8 needs AVX2, otherwise go via the
// float compare and bitcast.
#if defined CMT_ARCH_AVX2
KFR_INTRINSIC __m256i _mm256_allones_si256()
{
    return _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
}
#else
KFR_INTRINSIC __m256i _mm256_allones_si256()
{
    return _mm256_castps_si256(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ));
}
#endif

// Bitwise complement helpers (xor with all-ones).
KFR_INTRINSIC __m256 _mm256_not_ps(const __m256& x) { return _mm256_xor_ps(x, _mm256_allones_ps()); }
KFR_INTRINSIC __m256d _mm256_not_pd(const __m256d& x) { return _mm256_xor_pd(x, _mm256_allones_pd()); }
KFR_INTRINSIC __m256i _mm256_not_si256(const __m256i& x)
{
    return _mm256_xor_si256(x, _mm256_allones_si256());
}

// Sign-bit constants for the unsigned-compare bias trick (see SSE versions).
KFR_INTRINSIC __m256i _mm256_highbit_epi8() { return _mm256_set1_epi8(static_cast<char>(0x80)); }
KFR_INTRINSIC __m256i _mm256_highbit_epi16() { return _mm256_set1_epi16(static_cast<short>(0x8000)); }
KFR_INTRINSIC __m256i _mm256_highbit_epi32() { return _mm256_set1_epi32(static_cast<int>(0x80000000)); }
KFR_INTRINSIC __m256i _mm256_highbit_epi64() { return _mm256_set1_epi64x(0x8000000000000000ll); }

// AVX float comparisons use the ordered (_OQ) predicates: any NaN operand
// makes the lane compare false.
// NOTE(review): _CMP_NEQ_OQ is also ordered, so ne(NaN, NaN) is false here,
// whereas the SSE ne above (not(cmpeq)) yields true for NaN operands —
// confirm which NaN behavior is intended (_CMP_NEQ_UQ would match SSE).
KFR_INTRINSIC f32avx eq(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_EQ_OQ); }
KFR_INTRINSIC f64avx eq(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_EQ_OQ); }
KFR_INTRINSIC f32avx ne(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_NEQ_OQ); }
KFR_INTRINSIC f64avx ne(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_NEQ_OQ); }
KFR_INTRINSIC f32avx lt(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_LT_OQ); }
KFR_INTRINSIC f64avx lt(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_LT_OQ); }
KFR_INTRINSIC f32avx gt(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_GT_OQ); }
KFR_INTRINSIC f64avx gt(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_GT_OQ); }
KFR_INTRINSIC f32avx le(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_LE_OQ); }
KFR_INTRINSIC f64avx le(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_LE_OQ); }
KFR_INTRINSIC f32avx ge(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_GE_OQ); }
KFR_INTRINSIC f64avx ge(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_GE_OQ); }

KFR_INTRINSIC f32avx band(const f32avx& x, const f32avx& y) { return _mm256_and_ps(x.v, y.v); }
KFR_INTRINSIC f64avx band(const f64avx& x, const f64avx& y) { return _mm256_and_pd(x.v, y.v); }
KFR_INTRINSIC f32avx bor(const f32avx& x, const f32avx& y) { return _mm256_or_ps(x.v, y.v); }
KFR_INTRINSIC f64avx bor(const f64avx& x, const f64avx& y) { return _mm256_or_pd(x.v, y.v); }
KFR_INTRINSIC f32avx bxor(const f32avx& x, const f32avx& y) { return _mm256_xor_ps(x.v, y.v); }
KFR_INTRINSIC f64avx bxor(const f64avx& x, const f64avx& y) { return _mm256_xor_pd(x.v, y.v); }

// Shifts of float bit patterns. AVX1 has no 256-bit integer shifts, so the
// fallback splits into two 128-bit halves and recombines with setr_m128.
KFR_INTRINSIC f32avx shl(const f32avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(x.v), y));
#else
    return _mm256_setr_m128(
        _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(x.v)), y)),
        _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(x.v, 1)), y)));
#endif
}
KFR_INTRINSIC f64avx shl(const f64avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(x.v), y));
#else
    return _mm256_setr_m128d(
        _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(_mm256_castpd256_pd128(x.v)), y)),
        _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(_mm256_extractf128_pd(x.v, 1)), y)));
#endif
}
KFR_INTRINSIC f32avx shr(const f32avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(x.v), y));
#else
    return _mm256_setr_m128(
        _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(_mm256_castps256_ps128(x.v)), y)),
        _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(_mm256_extractf128_ps(x.v, 1)), y)));
#endif
}
KFR_INTRINSIC f64avx shr(const f64avx& x, unsigned y)
{
#if defined CMT_ARCH_AVX2
    return _mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(x.v), y));
#else
    return _mm256_setr_m128d(
        _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(_mm256_castpd256_pd128(x.v)), y)),
        _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(_mm256_extractf128_pd(x.v, 1)), y)));
#endif
}

#if defined CMT_ARCH_AVX2

// ---- AVX2: 256-bit integer arithmetic ----
// Integer division has no SIMD instruction anywhere; all div overloads run
// an element-wise scalar loop.
// NOTE(review): unlike the u64/i64 div overloads below, the 8/16/32-bit
// versions do not guard against y[i] == 0 (UB in the scalar loop) — confirm
// whether this asymmetry is intentional.
KFR_INTRINSIC u8avx add(const u8avx& x, const u8avx& y) { return _mm256_add_epi8(x.v, y.v); }
KFR_INTRINSIC u8avx sub(const u8avx& x, const u8avx& y) { return _mm256_sub_epi8(x.v, y.v); }
KFR_INTRINSIC u8avx div(const u8avx& x, const u8avx& y)
{
    KFR_COMPONENTWISE_RET_I(u8avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC i8avx add(const i8avx& x, const i8avx& y) { return _mm256_add_epi8(x.v, y.v); }
KFR_INTRINSIC i8avx sub(const i8avx& x, const i8avx& y) { return _mm256_sub_epi8(x.v, y.v); }
KFR_INTRINSIC i8avx div(const i8avx& x, const i8avx& y)
{
    KFR_COMPONENTWISE_RET_I(i8avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC u16avx add(const u16avx& x, const u16avx& y) { return _mm256_add_epi16(x.v, y.v); }
KFR_INTRINSIC u16avx sub(const u16avx& x, const u16avx& y) { return _mm256_sub_epi16(x.v, y.v); }
KFR_INTRINSIC u16avx mul(const u16avx& x, const u16avx& y) { return _mm256_mullo_epi16(x.v, y.v); }
KFR_INTRINSIC u16avx div(const u16avx& x, const u16avx& y)
{
    KFR_COMPONENTWISE_RET_I(u16avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC i16avx add(const i16avx& x, const i16avx& y) { return _mm256_add_epi16(x.v, y.v); }
KFR_INTRINSIC i16avx sub(const i16avx& x, const i16avx& y) { return _mm256_sub_epi16(x.v, y.v); }
KFR_INTRINSIC i16avx mul(const i16avx& x, const i16avx& y) { return _mm256_mullo_epi16(x.v, y.v); }
KFR_INTRINSIC i16avx div(const i16avx& x, const i16avx& y)
{
    KFR_COMPONENTWISE_RET_I(i16avx, result[i] = x[i] / y[i]);
}

KFR_INTRINSIC u32avx add(const u32avx& x, const u32avx& y) { return _mm256_add_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx sub(const u32avx& x, const u32avx& y) { return _mm256_sub_epi32(x.v, y.v); }

KFR_INTRINSIC i32avx add(const i32avx& x, const i32avx& y) { return _mm256_add_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx sub(const i32avx& x, const i32avx& y) { return _mm256_sub_epi32(x.v, y.v); }

KFR_INTRINSIC u32avx mul(const u32avx& x, const u32avx& y) { return _mm256_mullo_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx mul(const i32avx& x, const i32avx& y) { return _mm256_mullo_epi32(x.v, y.v); }
KFR_INTRINSIC u32avx div(const u32avx& x, const u32avx& y)
{
    KFR_COMPONENTWISE_RET_I(u32avx, result[i] = x[i] / y[i]);
}
KFR_INTRINSIC i32avx div(const i32avx& x, const i32avx& y)
{
    KFR_COMPONENTWISE_RET_I(i32avx, result[i] = x[i] / y[i]);
}

// 64-bit mul/div have no AVX2 instruction; scalar loops. div maps a zero
// divisor to 0 instead of invoking undefined behavior.
KFR_INTRINSIC u64avx add(const u64avx& x, const u64avx& y) { return _mm256_add_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx sub(const u64avx& x, const u64avx& y) { return _mm256_sub_epi64(x.v, y.v); }
KFR_INTRINSIC u64avx mul(const u64avx& x, const u64avx& y)
{
    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = x[i] * y[i]);
}
KFR_INTRINSIC u64avx div(const u64avx& x, const u64avx& y)
{
    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = y[i] ? x[i] / y[i] : 0);
}

KFR_INTRINSIC i64avx add(const i64avx& x, const i64avx& y) { return _mm256_add_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx sub(const i64avx& x, const i64avx& y) { return _mm256_sub_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx mul(const i64avx& x, const i64avx& y)
{
    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] * y[i]);
}
KFR_INTRINSIC i64avx div(const i64avx& x, const i64avx& y)
{
    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = y[i] ? x[i] / y[i] : 0);
}

// 8-bit multiply: multiply even and odd bytes as 16-bit lanes, then merge the
// low byte of each product back into a single vector.
KFR_INTRINSIC __m256i mul_epi8(const __m256i& x, const __m256i& y)
{
    const __m256i even = _mm256_mullo_epi16(x, y);
    const __m256i odd = _mm256_mullo_epi16(_mm256_srli_epi16(x, 8), _mm256_srli_epi16(y, 8));
    return _mm256_or_si256(_mm256_slli_epi16(odd, 8), _mm256_srli_epi16(_mm256_slli_epi16(even, 8), 8));
}

KFR_INTRINSIC u8avx mul(const u8avx& x, const u8avx& y) { return mul_epi8(x.v, y.v); }
KFR_INTRINSIC i8avx mul(const i8avx& x, const i8avx& y) { return mul_epi8(x.v, y.v); }

// ---- AVX2: 256-bit integer bitwise ops ----
KFR_INTRINSIC u8avx band(const u8avx& x, const u8avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u16avx band(const u16avx& x, const u16avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u32avx band(const u32avx& x, const u32avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u64avx band(const u64avx& x, const u64avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i8avx band(const i8avx& x, const i8avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i16avx band(const i16avx& x, const i16avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i32avx band(const i32avx& x, const i32avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC i64avx band(const i64avx& x, const i64avx& y) { return _mm256_and_si256(x.v, y.v); }
KFR_INTRINSIC u8avx bor(const u8avx& x, const u8avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u16avx bor(const u16avx& x, const u16avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u32avx bor(const u32avx& x, const u32avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u64avx bor(const u64avx& x, const u64avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i8avx bor(const i8avx& x, const i8avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i16avx bor(const i16avx& x, const i16avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i32avx bor(const i32avx& x, const i32avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC i64avx bor(const i64avx& x, const i64avx& y) { return _mm256_or_si256(x.v, y.v); }
KFR_INTRINSIC u8avx bxor(const u8avx& x, const u8avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC u16avx bxor(const u16avx& x, const u16avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC u32avx bxor(const u32avx& x, const u32avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC u64avx bxor(const u64avx& x, const u64avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i8avx bxor(const i8avx& x, const i8avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i16avx bxor(const i16avx& x, const i16avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i32avx bxor(const i32avx& x, const i32avx& y) { return _mm256_xor_si256(x.v, y.v); }
KFR_INTRINSIC i64avx bxor(const i64avx& x, const i64avx& y) { return _mm256_xor_si256(x.v, y.v); }

// ---- AVX2: shifts by a uniform (immediate-style) count ----
// Signed right shifts use srai (arithmetic); unsigned use srli (logical).
KFR_INTRINSIC u16avx shl(const u16avx& x, unsigned y) { return _mm256_slli_epi16(x.v, y); }
KFR_INTRINSIC u32avx shl(const u32avx& x, unsigned y) { return _mm256_slli_epi32(x.v, y); }
KFR_INTRINSIC i16avx shl(const i16avx& x, unsigned y) { return _mm256_slli_epi16(x.v, y); }
KFR_INTRINSIC i32avx shl(const i32avx& x, unsigned y) { return _mm256_slli_epi32(x.v, y); }
KFR_INTRINSIC u16avx shr(const u16avx& x, unsigned y) { return _mm256_srli_epi16(x.v, y); }
KFR_INTRINSIC u32avx shr(const u32avx& x, unsigned y) { return _mm256_srli_epi32(x.v, y); }
KFR_INTRINSIC i16avx shr(const i16avx& x, unsigned y) { return _mm256_srai_epi16(x.v, y); }
KFR_INTRINSIC i32avx shr(const i32avx& x, unsigned y) { return _mm256_srai_epi32(x.v, y); }

KFR_INTRINSIC u64avx shl(const u64avx& x, unsigned y) { return _mm256_slli_epi64(x.v, y); }
KFR_INTRINSIC u64avx shr(const u64avx& x, unsigned y) { return _mm256_srli_epi64(x.v, y); }
KFR_INTRINSIC i64avx shl(const i64avx& x, unsigned y) { return _mm256_slli_epi64(x.v, y); }
// No srai_epi64 in AVX2; arithmetic i64 right shift is done per element
// (x[i] is signed, so >> is arithmetic; bits returned via a u64avx temporary).
KFR_INTRINSIC i64avx shr(const i64avx& x, unsigned y)
{
    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = x[i] >> y);
}

// ---- AVX2: 8-bit shifts (same widen/shift/pack trick as the SSE versions) ----
// NOTE(review): the byte payload is placed in the HIGH half of each 16-bit
// lane, but _mm256_packs_epi16 keeps the saturated low byte after the shift —
// confirm this matches the scalar per-byte shift for all y.
KFR_INTRINSIC u8avx shl(const u8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_slli_epi16(l, y);
    __m256i hh = _mm256_slli_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}
KFR_INTRINSIC i8avx shl(const i8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_slli_epi16(l, y);
    __m256i hh = _mm256_slli_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}
KFR_INTRINSIC u8avx shr(const u8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_srli_epi16(l, y);
    __m256i hh = _mm256_srli_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}
KFR_INTRINSIC i8avx shr(const i8avx& x, unsigned y)
{
    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
    __m256i ll = _mm256_srai_epi16(l, y);
    __m256i hh = _mm256_srai_epi16(h, y);

    return _mm256_packs_epi16(ll, hh);
}

// ---- AVX2: per-element variable shifts ----
// The 128-bit sllv/srlv/srav forms are also AVX2 instructions, which is why
// these SSE-width overloads live inside this CMT_ARCH_AVX2 block.
KFR_INTRINSIC u32sse shl(const u32sse& x, const u32sse& y) { return _mm_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC i32sse shl(const i32sse& x, const u32sse& y) { return _mm_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC u64sse shl(const u64sse& x, const u64sse& y) { return _mm_sllv_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse shl(const i64sse& x, const u64sse& y) { return _mm_sllv_epi64(x.v, y.v); }

KFR_INTRINSIC u32avx shl(const u32avx& x, const u32avx& y) { return _mm256_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx shl(const i32avx& x, const u32avx& y) { return _mm256_sllv_epi32(x.v, y.v); }
KFR_INTRINSIC u64avx shl(const u64avx& x, const u64avx& y) { return _mm256_sllv_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx shl(const i64avx& x, const u64avx& y) { return _mm256_sllv_epi64(x.v, y.v); }

// srav_epi64 does not exist in AVX2, so the signed 64-bit variable right
// shift falls back to a scalar loop.
KFR_INTRINSIC u32sse shr(const u32sse& x, const u32sse& y) { return _mm_srlv_epi32(x.v, y.v); }
KFR_INTRINSIC i32sse shr(const i32sse& x, const u32sse& y) { return _mm_srav_epi32(x.v, y.v); }
KFR_INTRINSIC u64sse shr(const u64sse& x, const u64sse& y) { return _mm_srlv_epi64(x.v, y.v); }
KFR_INTRINSIC i64sse shr(const i64sse& x, const u64sse& y)
{
    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] >> y[i]);
}

KFR_INTRINSIC u32avx shr(const u32avx& x, const u32avx& y) { return _mm256_srlv_epi32(x.v, y.v); }
KFR_INTRINSIC i32avx shr(const i32avx& x, const u32avx& y) { return _mm256_srav_epi32(x.v, y.v); }
KFR_INTRINSIC u64avx shr(const u64avx& x, const u64avx& y) { return _mm256_srlv_epi64(x.v, y.v); }
KFR_INTRINSIC i64avx shr(const i64avx& x, const u64avx& y)
{
    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] >> y[i]);
}

// Variable shifts of float bit patterns (cast to integer, shift, cast back).
KFR_INTRINSIC f32sse shl(const f32sse& x, const u32sse& y)
{
    return _mm_castsi128_ps(_mm_sllv_epi32(_mm_castps_si128(x.v), y.v));
}
KFR_INTRINSIC f64sse shl(const f64sse& x, const u64sse& y)
{
    return _mm_castsi128_pd(_mm_sllv_epi64(_mm_castpd_si128(x.v), y.v));
}
KFR_INTRINSIC f32sse shr(const f32sse& x, const u32sse& y)
{
    return _mm_castsi128_ps(_mm_srlv_epi32(_mm_castps_si128(x.v), y.v));
}
KFR_INTRINSIC f64sse shr(const f64sse& x, const u64sse& y)
{
    return _mm_castsi128_pd(_mm_srlv_epi64(_mm_castpd_si128(x.v), y.v));
}

KFR_INTRINSIC f32avx shl(const f32avx& x, const u32avx& y)
{
    return _mm256_castsi256_ps(_mm256_sllv_epi32(_mm256_castps_si256(x.v), y.v));
}
KFR_INTRINSIC f64avx shl(const f64avx& x, const u64avx& y)
{
    return _mm256_castsi256_pd(_mm256_sllv_epi64(_mm256_castpd_si256(x.v), y.v));
}
KFR_INTRINSIC f32avx shr(const f32avx& x, const u32avx& y)
{
    return _mm256_castsi256_ps(_mm256_srlv_epi32(_mm256_castps_si256(x.v), y.v));
}
KFR_INTRINSIC f64avx shr(const f64avx& x, const u64avx& y)
{
    return _mm256_castsi256_pd(_mm256_srlv_epi64(_mm256_castpd_si256(x.v), y.v));
}

// ---- AVX2: integer comparisons (per-lane all-ones / all-zero masks) ----
KFR_INTRINSIC i8avx eq(const i8avx& x, const i8avx& y) { return _mm256_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC i16avx eq(const i16avx& x, const i16avx& y) { return _mm256_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC i32avx eq(const i32avx& x, const i32avx& y) { return _mm256_cmpeq_epi32(x.v, y.v); }
KFR_INTRINSIC i64avx eq(const i64avx& x, const i64avx& y) { return _mm256_cmpeq_epi64(x.v, y.v); }
KFR_INTRINSIC u8avx eq(const u8avx& x, const u8avx& y) { return _mm256_cmpeq_epi8(x.v, y.v); }
KFR_INTRINSIC u16avx eq(const u16avx& x, const u16avx& y) { return _mm256_cmpeq_epi16(x.v, y.v); }
KFR_INTRINSIC u32avx eq(const u32avx& x, const u32avx& y) { return _mm256_cmpeq_epi32(x.v, y.v); }
KFR_INTRINSIC u64avx eq(const u64avx& x, const u64avx& y) { return _mm256_cmpeq_epi64(x.v, y.v); }

// ne = complement of eq.
KFR_INTRINSIC i8avx ne(const i8avx& x, const i8avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi8(x.v, y.v));
}
KFR_INTRINSIC i16avx ne(const i16avx& x, const i16avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi16(x.v, y.v));
}
KFR_INTRINSIC i32avx ne(const i32avx& x, const i32avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi32(x.v, y.v));
}
KFR_INTRINSIC i64avx ne(const i64avx& x, const i64avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi64(x.v, y.v));
}
KFR_INTRINSIC u8avx ne(const u8avx& x, const u8avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi8(x.v, y.v));
}
KFR_INTRINSIC u16avx ne(const u16avx& x, const u16avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi16(x.v, y.v));
}
KFR_INTRINSIC u32avx ne(const u32avx& x, const u32avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi32(x.v, y.v));
}
KFR_INTRINSIC u64avx ne(const u64avx& x, const u64avx& y)
{
    return _mm256_not_si256(_mm256_cmpeq_epi64(x.v, y.v));
}

// AVX2 only provides signed cmpgt; lt swaps operands, le/ge complement.
KFR_INTRINSIC i8avx lt(const i8avx& x, const i8avx& y) { return _mm256_cmpgt_epi8(y.v, x.v); }
KFR_INTRINSIC i16avx lt(const i16avx& x, const i16avx& y) { return _mm256_cmpgt_epi16(y.v, x.v); }
KFR_INTRINSIC i32avx lt(const i32avx& x, const i32avx& y) { return _mm256_cmpgt_epi32(y.v, x.v); }
KFR_INTRINSIC i64avx lt(const i64avx& x, const i64avx& y) { return _mm256_cmpgt_epi64(y.v, x.v); }

KFR_INTRINSIC i8avx gt(const i8avx& x, const i8avx& y) { return _mm256_cmpgt_epi8(x.v, y.v); }
KFR_INTRINSIC i16avx gt(const i16avx& x, const i16avx& y) { return _mm256_cmpgt_epi16(x.v, y.v); }
KFR_INTRINSIC i32avx gt(const i32avx& x, const i32avx& y) { return _mm256_cmpgt_epi32(x.v, y.v); }
KFR_INTRINSIC i64avx gt(const i64avx& x, const i64avx& y) { return _mm256_cmpgt_epi64(x.v, y.v); }

KFR_INTRINSIC i8avx le(const i8avx& x, const i8avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi8(x.v, y.v));
}
KFR_INTRINSIC i16avx le(const i16avx& x, const i16avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi16(x.v, y.v));
}
KFR_INTRINSIC i32avx le(const i32avx& x, const i32avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi32(x.v, y.v));
}
KFR_INTRINSIC i64avx le(const i64avx& x, const i64avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi64(x.v, y.v));
}

KFR_INTRINSIC i8avx ge(const i8avx& x, const i8avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi8(y.v, x.v));
}
KFR_INTRINSIC i16avx ge(const i16avx& x, const i16avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi16(y.v, x.v));
}
KFR_INTRINSIC i32avx ge(const i32avx& x, const i32avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi32(y.v, x.v));
}
KFR_INTRINSIC i64avx ge(const i64avx& x, const i64avx& y)
{
    return _mm256_not_si256(_mm256_cmpgt_epi64(y.v, x.v));
}

// Unsigned compares: bias both operands by the sign bit, then use the signed
// compare (same trick as the SSE versions).
KFR_INTRINSIC u8avx lt(const u8avx& x, const u8avx& y)
{
    const __m256i hb = _mm256_highbit_epi8();
    return _mm256_cmpgt_epi8(_mm256_add_epi8(y.v, hb), _mm256_add_epi8(x.v, hb));
}
KFR_INTRINSIC u16avx lt(const u16avx& x, const u16avx& y)
{
    const __m256i hb = _mm256_highbit_epi16();
    return _mm256_cmpgt_epi16(_mm256_add_epi16(y.v, hb), _mm256_add_epi16(x.v, hb));
}
// AVX2 unsigned comparisons, continued: only signed cmpgt exists, so both
// operands are biased by the sign-bit constant, after which the signed
// compare orders the values as unsigned. lt swaps operands; le complements gt.
KFR_INTRINSIC u32avx lt(const u32avx& x, const u32avx& y)
{
    const __m256i hb = _mm256_highbit_epi32();
    return _mm256_cmpgt_epi32(_mm256_add_epi32(y.v, hb), _mm256_add_epi32(x.v, hb));
}
KFR_INTRINSIC u64avx lt(const u64avx& x, const u64avx& y)
{
    const __m256i hb = _mm256_highbit_epi64();
    return _mm256_cmpgt_epi64(_mm256_add_epi64(y.v, hb), _mm256_add_epi64(x.v, hb));
}
KFR_INTRINSIC u8avx gt(const u8avx& x, const u8avx& y)
{
    const __m256i hb = _mm256_highbit_epi8();
    return _mm256_cmpgt_epi8(_mm256_add_epi8(x.v, hb), _mm256_add_epi8(y.v, hb));
}
KFR_INTRINSIC u16avx gt(const u16avx& x, const u16avx& y)
{
    const __m256i hb = _mm256_highbit_epi16();
    return _mm256_cmpgt_epi16(_mm256_add_epi16(x.v, hb), _mm256_add_epi16(y.v, hb));
}
KFR_INTRINSIC u32avx gt(const u32avx& x, const u32avx& y)
{
    const __m256i hb = _mm256_highbit_epi32();
    return _mm256_cmpgt_epi32(_mm256_add_epi32(x.v, hb), _mm256_add_epi32(y.v, hb));
}
KFR_INTRINSIC u64avx gt(const u64avx& x, const u64avx& y)
{
    const __m256i hb = _mm256_highbit_epi64();
    return _mm256_cmpgt_epi64(_mm256_add_epi64(x.v, hb), _mm256_add_epi64(y.v, hb));
}
KFR_INTRINSIC u8avx le(const u8avx& x, const u8avx& y)
{
    const __m256i hb = _mm256_highbit_epi8();
    return _mm256_not_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(x.v, hb), _mm256_add_epi8(y.v, hb)));
}
KFR_INTRINSIC u16avx le(const u16avx& x, const u16avx& y)
{
    const __m256i hb = _mm256_highbit_epi16();
    return _mm256_not_si256(_mm256_cmpgt_epi16(_mm256_add_epi16(x.v, hb), _mm256_add_epi16(y.v, hb)));
}
KFR_INTRINSIC u32avx le(const u32avx& x, const u32avx& y)
{
    const __m256i hb = _mm256_highbit_epi32();
    return _mm256_not_si256(_mm256_cmpgt_epi32(_mm256_add_epi32(x.v, hb), _mm256_add_epi32(y.v, hb)));
}
KFR_INTRINSIC u64avx le(const u64avx& x, const u64avx& y)
{
    const __m256i hb = _mm256_highbit_epi64();
    return _mm256_not_si256(_mm256_cmpgt_epi64(_mm256_add_epi64(x.v, hb), _mm256_add_epi64(y.v, hb)));
}
+KFR_INTRINSIC u8avx ge(const u8avx& x, const u8avx& y) +{ + const __m256i hb = _mm256_highbit_epi8(); + return _mm256_not_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(y.v, hb), _mm256_add_epi8(x.v, hb))); +} +KFR_INTRINSIC u16avx ge(const u16avx& x, const u16avx& y) +{ + const __m256i hb = _mm256_highbit_epi16(); + return _mm256_not_si256(_mm256_cmpgt_epi16(_mm256_add_epi16(y.v, hb), _mm256_add_epi16(x.v, hb))); +} +KFR_INTRINSIC u32avx ge(const u32avx& x, const u32avx& y) +{ + const __m256i hb = _mm256_highbit_epi32(); + return _mm256_not_si256(_mm256_cmpgt_epi32(_mm256_add_epi32(y.v, hb), _mm256_add_epi32(x.v, hb))); +} +KFR_INTRINSIC u64avx ge(const u64avx& x, const u64avx& y) +{ + const __m256i hb = _mm256_highbit_epi64(); + return _mm256_not_si256(_mm256_cmpgt_epi64(_mm256_add_epi64(y.v, hb), _mm256_add_epi64(x.v, hb))); +} + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f32avx512 add(const f32avx512& x, const f32avx512& y) { return _mm512_add_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 add(const f64avx512& x, const f64avx512& y) { return _mm512_add_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 sub(const f32avx512& x, const f32avx512& y) { return _mm512_sub_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 sub(const f64avx512& x, const f64avx512& y) { return _mm512_sub_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 mul(const f32avx512& x, const f32avx512& y) { return _mm512_mul_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 mul(const f64avx512& x, const f64avx512& y) { return _mm512_mul_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 div(const f32avx512& x, const f32avx512& y) { return _mm512_div_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 div(const f64avx512& x, const f64avx512& y) { return _mm512_div_pd(x.v, y.v); } + +KFR_INTRINSIC __m512 _mm512_allones_ps() +{ + return _mm512_castsi512_ps(_mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), + _mm512_setzero_si512(), 0xFF)); +} + +KFR_INTRINSIC __m512d _mm512_allones_pd() +{ + return 
_mm512_castsi512_pd(_mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(),
+                                              _mm512_setzero_si512(), 0xFF));
+}
+
+KFR_INTRINSIC __m512i _mm512_allones_si512()
+{
+    return _mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(),
+                                     0xFF);
+}
+
+// Bitwise NOT == xor with all-ones.
+KFR_INTRINSIC __m512 _mm512_not_ps(const __m512& x) { return _mm512_xor_ps(x, _mm512_allones_ps()); }
+KFR_INTRINSIC __m512d _mm512_not_pd(const __m512d& x) { return _mm512_xor_pd(x, _mm512_allones_pd()); }
+KFR_INTRINSIC __m512i _mm512_not_si512(const __m512i& x)
+{
+    return _mm512_xor_si512(x, _mm512_allones_si512());
+}
+
+// Per-lane sign-bit constants (used to bias unsigned compares elsewhere).
+KFR_INTRINSIC __m512i _mm512_highbit_epi8() { return _mm512_set1_epi8(static_cast<char>(0x80)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi16() { return _mm512_set1_epi16(static_cast<short>(0x8000)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi32() { return _mm512_set1_epi32(static_cast<int>(0x80000000)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi64() { return _mm512_set1_epi64(0x8000000000000000ll); }
+
+// AVX-512 float compares yield a lane bitmask; movm expands it back into a
+// full per-lane mask vector so results match the SSE/AVX mask layout.
+KFR_INTRINSIC f32avx512 eq(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_EQ_OQ)));
+}
+KFR_INTRINSIC f64avx512 eq(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_EQ_OQ)));
+}
+// NOTE(review): _CMP_NEQ_OQ yields false when either operand is NaN, whereas
+// IEEE-754 `!=` (and SSE _mm_cmpneq_ps, which is NEQ_UQ) yields true --
+// confirm the ordered predicate is intentional here.
+KFR_INTRINSIC f32avx512 ne(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_NEQ_OQ)));
+}
+KFR_INTRINSIC f64avx512 ne(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_NEQ_OQ)));
+}
+KFR_INTRINSIC f32avx512 lt(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_LT_OQ)));
+}
+KFR_INTRINSIC f64avx512 lt(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_LT_OQ)));
+}
+KFR_INTRINSIC f32avx512 gt(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_GT_OQ)));
+}
+KFR_INTRINSIC f64avx512 gt(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_GT_OQ)));
+}
+KFR_INTRINSIC f32avx512 le(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_LE_OQ)));
+}
+KFR_INTRINSIC f64avx512 le(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_LE_OQ)));
+}
+KFR_INTRINSIC f32avx512 ge(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_GE_OQ)));
+}
+KFR_INTRINSIC f64avx512 ge(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_GE_OQ)));
+}
+
+KFR_INTRINSIC f32avx512 band(const f32avx512& x, const f32avx512& y) { return _mm512_and_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 band(const f64avx512& x, const f64avx512& y) { return _mm512_and_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 bor(const f32avx512& x, const f32avx512& y) { return _mm512_or_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 bor(const f64avx512& x, const f64avx512& y) { return _mm512_or_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 bxor(const f32avx512& x, const f32avx512& y) { return _mm512_xor_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 bxor(const f64avx512& x, const f64avx512& y) { return _mm512_xor_pd(x.v, y.v); }
+
+// NOTE(review): `#if 1` always selects the plain integer complement over the
+// _knot_mask* intrinsics -- presumably a compiler-availability workaround;
+// confirm before switching the branch.
+#if 1
+#define KFR_knot_mask8(x) ((__mmask8)(~((u8)(x))))
+#define KFR_knot_mask16(x) ((__mmask16)(~((u16)(x))))
+#define KFR_knot_mask32(x) ((__mmask32)(~((u32)(x))))
+#define KFR_knot_mask64(x) ((__mmask64)(~((u64)(x))))
+#else
+#define KFR_knot_mask8(x) _knot_mask8(x)
+#define 
KFR_knot_mask16(x) _knot_mask16(x)
+#define KFR_knot_mask32(x) _knot_mask32(x)
+#define KFR_knot_mask64(x) _knot_mask64(x)
+#endif
+
+// Signed integer compares: cmp*_mask yields one bit per lane; movm expands the
+// bitmask into a full per-lane mask vector. ne/ge/le complement the eq/lt
+// mask; the KFR_knot_maskN width N equals the lane count for the element size
+// (64 lanes of i8, 32 of i16, 16 of i32, 8 of i64).
+KFR_INTRINSIC i8avx512 eq(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(x.v, y.v));
+}
+KFR_INTRINSIC i16avx512 eq(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(x.v, y.v));
+}
+KFR_INTRINSIC i32avx512 eq(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(x.v, y.v));
+}
+KFR_INTRINSIC i64avx512 eq(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmpeq_epi64_mask(x.v, y.v));
+}
+KFR_INTRINSIC i8avx512 ne(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmpeq_epi8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i16avx512 ne(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmpeq_epi16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i32avx512 ne(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmpeq_epi32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i64avx512 ne(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmpeq_epi64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i8avx512 ge(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epi8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i16avx512 ge(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epi16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i32avx512 ge(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epi32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i64avx512 ge(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epi64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i8avx512 lt(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(x.v, y.v));
+}
+KFR_INTRINSIC i16avx512 lt(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(x.v, y.v));
+}
+KFR_INTRINSIC i32avx512 lt(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epi32_mask(x.v, y.v));
+}
+KFR_INTRINSIC i64avx512 lt(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epi64_mask(x.v, y.v));
+}
+KFR_INTRINSIC i8avx512 le(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epi8_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i16avx512 le(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epi16_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i32avx512 le(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epi32_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i64avx512 le(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epi64_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i8avx512 gt(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(y.v, x.v));
+}
+KFR_INTRINSIC i16avx512 gt(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(y.v, x.v));
+}
+KFR_INTRINSIC i32avx512 gt(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epi32_mask(y.v, x.v));
+}
+KFR_INTRINSIC i64avx512 gt(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epi64_mask(y.v, x.v));
+}
+
+// Unsigned compares use the native epu (unsigned) mask compares directly;
+// no sign-bit bias is needed on AVX-512.
+KFR_INTRINSIC u8avx512 eq(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmpeq_epu8_mask(x.v, y.v));
+}
+KFR_INTRINSIC u16avx512 eq(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmpeq_epu16_mask(x.v, y.v));
+}
+KFR_INTRINSIC u32avx512 eq(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmpeq_epu32_mask(x.v, y.v));
+}
+KFR_INTRINSIC u64avx512 eq(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmpeq_epu64_mask(x.v, y.v));
+}
+KFR_INTRINSIC u8avx512 ne(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmpeq_epu8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u16avx512 ne(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmpeq_epu16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u32avx512 ne(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmpeq_epu32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u64avx512 ne(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmpeq_epu64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u8avx512 ge(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epu8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u16avx512 ge(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epu16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u32avx512 ge(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epu32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u64avx512 ge(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epu64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u8avx512 lt(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(x.v, y.v));
+}
+KFR_INTRINSIC u16avx512 lt(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(x.v, y.v));
+}
+KFR_INTRINSIC u32avx512 lt(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epu32_mask(x.v, y.v));
+}
+KFR_INTRINSIC u64avx512 lt(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epu64_mask(x.v, y.v));
+}
+KFR_INTRINSIC u8avx512 le(const u8avx512& 
x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epu8_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u16avx512 le(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epu16_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u32avx512 le(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epu32_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u64avx512 le(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epu64_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u8avx512 gt(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(y.v, x.v));
+}
+KFR_INTRINSIC u16avx512 gt(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(y.v, x.v));
+}
+KFR_INTRINSIC u32avx512 gt(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epu32_mask(y.v, x.v));
+}
+KFR_INTRINSIC u64avx512 gt(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epu64_mask(y.v, x.v));
+}
+
+// Integer add/sub: two's-complement wrap-around is signedness-agnostic, so
+// signed and unsigned element types share the same epi intrinsics.
+KFR_INTRINSIC i8avx512 add(const i8avx512& x, const i8avx512& y) { return _mm512_add_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 add(const i16avx512& x, const i16avx512& y) { return _mm512_add_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 add(const i32avx512& x, const i32avx512& y) { return _mm512_add_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 add(const i64avx512& x, const i64avx512& y) { return _mm512_add_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 add(const u8avx512& x, const u8avx512& y) { return _mm512_add_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 add(const u16avx512& x, const u16avx512& y) { return _mm512_add_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 add(const u32avx512& x, const u32avx512& y) { return _mm512_add_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 add(const u64avx512& x, const u64avx512& y) { return _mm512_add_epi64(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 sub(const i8avx512& x, const i8avx512& y) { return _mm512_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 sub(const i16avx512& x, const i16avx512& y) { return _mm512_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 sub(const i32avx512& x, const i32avx512& y) { return _mm512_sub_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 sub(const i64avx512& x, const i64avx512& y) { return _mm512_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 sub(const u8avx512& x, const u8avx512& y) { return _mm512_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 sub(const u16avx512& x, const u16avx512& y) { return _mm512_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 sub(const u32avx512& x, const u32avx512& y) { return _mm512_sub_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 sub(const u64avx512& x, const u64avx512& y) { return _mm512_sub_epi64(x.v, y.v); }
+
+// There is no 8-bit SIMD multiply: multiply even/odd bytes inside 16-bit
+// lanes, then recombine the low byte of each 16-bit product.
+KFR_INTRINSIC __m512i mul_epi8(const __m512i& x, const __m512i& y)
+{
+    const __m512i even = _mm512_mullo_epi16(x, y);
+    const __m512i odd = _mm512_mullo_epi16(_mm512_srli_epi16(x, 8), _mm512_srli_epi16(y, 8));
+    return _mm512_or_si512(_mm512_slli_epi16(odd, 8), _mm512_srli_epi16(_mm512_slli_epi16(even, 8), 8));
+}
+
+// NOTE(review): _mm512_mullo_epi64 requires AVX-512DQ -- confirm the build
+// gates this path on DQ in addition to F/BW.
+KFR_INTRINSIC i8avx512 mul(const i8avx512& x, const i8avx512& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 mul(const i16avx512& x, const i16avx512& y) { return _mm512_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 mul(const i32avx512& x, const i32avx512& y) { return _mm512_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 mul(const i64avx512& x, const i64avx512& y) { return _mm512_mullo_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 mul(const u8avx512& x, const u8avx512& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 mul(const u16avx512& x, const u16avx512& y) { return _mm512_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 mul(const u32avx512& x, const u32avx512& y) { return _mm512_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 mul(const u64avx512& x, const u64avx512& y) { return _mm512_mullo_epi64(x.v, 
y.v); }
+
+// Integer division has no SIMD instruction; compute it lane-by-lane.
+// Division by zero yields 0 rather than faulting (explicit `y[i] ? ... : 0`
+// guard).
+// FIX: the signed overloads previously accumulated into the *unsigned* vector
+// types (u8avx512 etc.); the result container now matches the signed operand
+// type so the signed quotient's element type is preserved.
+KFR_INTRINSIC i8avx512 div(const i8avx512& x, const i8avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i8avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i16avx512 div(const i16avx512& x, const i16avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i16avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i32avx512 div(const i32avx512& x, const i32avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i32avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i64avx512 div(const i64avx512& x, const i64avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u8avx512 div(const u8avx512& x, const u8avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u8avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u16avx512 div(const u16avx512& x, const u16avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u16avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u32avx512 div(const u32avx512& x, const u32avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u32avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u64avx512 div(const u64avx512& x, const u64avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u64avx512, result[i] = y[i] ? 
x[i] / y[i] : 0);
+}
+
+// Bitwise ops are element-size-agnostic; all integer widths share si512 forms.
+KFR_INTRINSIC i8avx512 band(const i8avx512& x, const i8avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 band(const i16avx512& x, const i16avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 band(const i32avx512& x, const i32avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 band(const i64avx512& x, const i64avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 band(const u8avx512& x, const u8avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 band(const u16avx512& x, const u16avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 band(const u32avx512& x, const u32avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 band(const u64avx512& x, const u64avx512& y) { return _mm512_and_si512(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 bor(const i8avx512& x, const i8avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 bor(const i16avx512& x, const i16avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 bor(const i32avx512& x, const i32avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 bor(const i64avx512& x, const i64avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 bor(const u8avx512& x, const u8avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 bor(const u16avx512& x, const u16avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 bor(const u32avx512& x, const u32avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 bor(const u64avx512& x, const u64avx512& y) { return _mm512_or_si512(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 bxor(const i8avx512& x, const i8avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 bxor(const i16avx512& x, const i16avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 bxor(const i32avx512& x, const i32avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 bxor(const i64avx512& x, const i64avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 bxor(const u8avx512& x, const u8avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 bxor(const u16avx512& x, const u16avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 bxor(const u32avx512& x, const u32avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 bxor(const u64avx512& x, const u64avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+
+// Shifts of float vectors operate on the raw bit patterns (logical shifts on
+// the reinterpreted integer lanes).
+KFR_INTRINSIC f32avx512 shl(const f32avx512& x, unsigned y)
+{
+    return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(x.v), y));
+}
+KFR_INTRINSIC f64avx512 shl(const f64avx512& x, unsigned y)
+{
+    return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(x.v), y));
+}
+KFR_INTRINSIC f32avx512 shr(const f32avx512& x, unsigned y)
+{
+    return _mm512_castsi512_ps(_mm512_srli_epi32(_mm512_castps_si512(x.v), y));
+}
+KFR_INTRINSIC f64avx512 shr(const f64avx512& x, unsigned y)
+{
+    return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(x.v), y));
+}
+
+// 16/32-bit shifts: shr is logical (srli) for unsigned, arithmetic (srai) for
+// signed element types.
+KFR_INTRINSIC u16avx512 shl(const u16avx512& x, unsigned y) { return _mm512_slli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx512 shl(const u32avx512& x, unsigned y) { return _mm512_slli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx512 shl(const i16avx512& x, unsigned y) { return _mm512_slli_epi16(x.v, y); }
+KFR_INTRINSIC i32avx512 shl(const i32avx512& x, unsigned y) { return _mm512_slli_epi32(x.v, y); }
+KFR_INTRINSIC u16avx512 shr(const u16avx512& x, unsigned y) { return _mm512_srli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx512 shr(const u32avx512& x, unsigned y) { return _mm512_srli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx512 shr(const i16avx512& x, unsigned y) { return _mm512_srai_epi16(x.v, y); }
+KFR_INTRINSIC i32avx512 shr(const i32avx512& x, unsigned y) { return _mm512_srai_epi32(x.v, y); }
+
+KFR_INTRINSIC u64avx512 shl(const u64avx512& x, unsigned y) { return _mm512_slli_epi64(x.v, y); }
+KFR_INTRINSIC u64avx512 shr(const u64avx512& x, unsigned y) { return _mm512_srli_epi64(x.v, y); }
+KFR_INTRINSIC i64avx512 shl(const i64avx512& x, unsigned y) { return _mm512_slli_epi64(x.v, y); }
+// Scalar fallback for 64-bit arithmetic right shift.
+// FIX: the result container was u64avx512 for a signed result; it now matches
+// the operand type. NOTE(review): AVX-512F provides _mm512_srai_epi64, which
+// could replace this loop entirely -- confirm and simplify.
+KFR_INTRINSIC i64avx512 shr(const i64avx512& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx512, result[i] = x[i] >> y);
+}
+
+// 8-bit shifts: widen bytes into the high half of 16-bit lanes (unpack with
+// zeros), shift at 16-bit granularity, then narrow back.
+// NOTE(review): _mm512_packs_epi16 *saturates* signed 16-bit values to 8 bits
+// and does not extract the high byte of the shifted lane -- verify this
+// narrowing step against the SSE/AVX2 8-bit shift implementations.
+KFR_INTRINSIC u8avx512 shl(const u8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_slli_epi16(l, y);
+    __m512i hh = _mm512_slli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx512 shl(const i8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_slli_epi16(l, y);
+    __m512i hh = _mm512_slli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC u8avx512 shr(const u8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_srli_epi16(l, y);
+    __m512i hh = _mm512_srli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx512 shr(const i8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_srai_epi16(l, y);
+    __m512i hh = _mm512_srai_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+
+KFR_INTRINSIC u32avx512 shl(const u32avx512& x, const u32avx512& y) { return _mm512_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx512 shl(const i32avx512& x, const u32avx512& y) { return _mm512_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 shl(const u64avx512& x, const u64avx512& y) { return 
_mm512_sllv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 shl(const i64avx512& x, const u64avx512& y) { return _mm512_sllv_epi64(x.v, y.v); }
+
+// Per-lane variable shifts; shr uses arithmetic (srav) forms for signed types.
+KFR_INTRINSIC u32avx512 shr(const u32avx512& x, const u32avx512& y) { return _mm512_srlv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx512 shr(const i32avx512& x, const u32avx512& y) { return _mm512_srav_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 shr(const u64avx512& x, const u64avx512& y) { return _mm512_srlv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 shr(const i64avx512& x, const u64avx512& y) { return _mm512_srav_epi64(x.v, y.v); }
+
+// Variable shifts of float vectors act on the raw bit patterns.
+KFR_INTRINSIC f32avx512 shl(const f32avx512& x, const u32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_sllv_epi32(_mm512_castps_si512(x.v), y.v));
+}
+KFR_INTRINSIC f64avx512 shl(const f64avx512& x, const u64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(x.v), y.v));
+}
+KFR_INTRINSIC f32avx512 shr(const f32avx512& x, const u32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_srlv_epi32(_mm512_castps_si512(x.v), y.v));
+}
+KFR_INTRINSIC f64avx512 shr(const f64avx512& x, const u64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_srlv_epi64(_mm512_castpd_si512(x.v), y.v));
+}
+
+#endif
+
+#endif
+
+#endif
+
+// Adapters that extend the fixed-width intrinsic overloads above to any vector
+// length N: narrow vectors are widened to the native SIMD width and sliced
+// back; over-wide vectors are split in half recursively via low()/high().
+#define KFR_HANDLE_ALL_SIZES_SHIFT_2(fn)                                                                     \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const unsigned b)                                         \
+    {                                                                                                        \
+        return slice<0, N>(fn(expand_simd(a), b));                                                           \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const unsigned b)                                         \
+    {                                                                                                        \
+        return concat(fn(low(a), b), fn(high(a), b));                                                        \
+    }
+#define KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(fn)                                                                 \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<utype<T>, N>& b)                                \
+    {                                                                                                        \
+        return slice<0, N>(fn(expand_simd(a), expand_simd(b)));                                              \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<utype<T>, N>& b)                                \
+    {                                                                                                        \
+        return concat(fn(low(a), low(b)), fn(high(a), high(b)));                                             \
+    }
+
+KFR_HANDLE_ALL_SIZES_2(add)
+KFR_HANDLE_ALL_SIZES_2(sub)
+KFR_HANDLE_ALL_SIZES_2(mul)
+KFR_HANDLE_ALL_SIZES_2(div)
+
+KFR_HANDLE_ALL_SIZES_2(eq)
+KFR_HANDLE_ALL_SIZES_2(ne)
+KFR_HANDLE_ALL_SIZES_2(lt)
+KFR_HANDLE_ALL_SIZES_2(gt)
+KFR_HANDLE_ALL_SIZES_2(le)
+KFR_HANDLE_ALL_SIZES_2(ge)
+
+KFR_HANDLE_ALL_SIZES_2(band)
+KFR_HANDLE_ALL_SIZES_2(bor)
+KFR_HANDLE_ALL_SIZES_2(bxor)
+
+KFR_HANDLE_ALL_SIZES_SHIFT_2(shl)
+KFR_HANDLE_ALL_SIZES_SHIFT_2(shr)
+KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(shl)
+KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(shr)
+
+#else
+
+// Generic (no-SIMD) fallback: everything is computed lane-by-lane.
+// Shifts go through the unsigned integer bit pattern (uibitcast) so the
+// operation is well-defined for float element types too.
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y)));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y)));
+}
+
+// Fallback compares: internal::maskbits builds the all-ones/all-zeros lane
+// value from the scalar comparison result.
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] == y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] != y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] >= y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] <= y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] > y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] < y[i]));
+}
+
+// Fallback bitwise ops go through the unsigned bit pattern (ubitcast).
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bor(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>((ubitcast(x[i]) | ubitcast(y[i])))));
+}
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bxor(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>(ubitcast(x[i]) ^ ubitcast(y[i]))));
+}
+template <typename T, size_t N, 
typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> band(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>(ubitcast(x[i]) & ubitcast(y[i])))); +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> add(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] + y[i]); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> sub(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] - y[i]); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> mul(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] * y[i]); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> +KFR_INTRINSIC vec<T, N> div(const vec<T, N>& x, const vec<T, N>& y) +{ + KFR_COMPONENTWISE_RET(result[i] = x[i] / y[i]); +} + +#define KFR_HANDLE_VEC_SCA(fn) \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& x, const T& y) \ + { \ + return fn(x, vec<T, N>(y)); \ + } \ + template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)> \ + KFR_INTRINSIC vec<T, N> fn(const T& x, const vec<T, N>& y) \ + { \ + return fn(vec<T, N>(x), y); \ + } + +KFR_HANDLE_VEC_SCA(add) +KFR_HANDLE_VEC_SCA(sub) +KFR_HANDLE_VEC_SCA(mul) +KFR_HANDLE_VEC_SCA(div) +KFR_HANDLE_VEC_SCA(band) +KFR_HANDLE_VEC_SCA(bor) +KFR_HANDLE_VEC_SCA(bxor) +KFR_HANDLE_VEC_SCA(eq) +KFR_HANDLE_VEC_SCA(ne) +KFR_HANDLE_VEC_SCA(lt) +KFR_HANDLE_VEC_SCA(gt) +KFR_HANDLE_VEC_SCA(le) +KFR_HANDLE_VEC_SCA(ge) + +#endif + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x) +{ + return bxor(special_constants<T>::allones(), x); +} + +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> 
+KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) +{ + return sub(T(0), x); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) +{ + return bxor(special_constants<T>::highbitmask(), x); +} + +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/simd/impl/function.hpp b/include/kfr/simd/impl/function.hpp @@ -0,0 +1,295 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
 */
#pragma once

#include "../../base/expression.hpp"
#include "../shuffle.hpp"
#include "../types.hpp"
#include "../vec.hpp"

CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

// Forwards a non-floating-point argument to the floating-point overload of fn
// by element-casting to the corresponding float type.
#define KFR_HANDLE_NOT_F_1(fn)                                                                               \
    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>                                    \
    KFR_INTRINSIC vec<flt_type<T>, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                    \
    {                                                                                                        \
        return intrinsics::fn(elemcast<flt_type<T>>(a));                                                     \
    }

// Scalar entry point: wraps scalar arguments into 1-element vectors, calls the
// vector intrinsic, and unwraps the result. Enabled only when no argument is a vec.
#define KFR_HANDLE_SCALAR(fn)                                                                                \
    template <typename T1, typename... Args, typename Tout = ::kfr::common_type<T1, Args...>,                \
              KFR_ENABLE_IF(!or_t<is_vec<T1>, is_vec<Args>...>::value)>                                      \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }

// As KFR_HANDLE_SCALAR, but with an explicitly supplied result type Tout.
#define KFR_HANDLE_SCALAR_1_T(fn, Tout)                                                                      \
    template <typename T1, typename... Args, typename T = ::kfr::common_type<T1, Args...>,                   \
              KFR_ENABLE_IF(!or_t<is_vec<T1>, is_vec<Args>...>::value)>                                      \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }

// Counterpart of KFR_HANDLE_SCALAR_1_T enabled when at least one argument IS a vec.
#define KFR_HANDLE_ARGS_T(fn, Tout)                                                                          \
    template <typename T1, typename... Args, typename T = ::kfr::common_type<T1, Args...>,                   \
              KFR_ENABLE_IF(or_t<is_vec<T1>, is_vec<Args>...>::value)>                                       \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }

namespace intrinsics
{
// Shorthand typedefs for full-register vectors of each element type,
// one set per instruction-set register width.
#ifdef CMT_ARCH_X86
using f32sse = vec<f32, 4>;
using f64sse = vec<f64, 2>;
using i8sse  = vec<i8, 16>;
using i16sse = vec<i16, 8>;
using i32sse = vec<i32, 4>;
using i64sse = vec<i64, 2>;
using u8sse  = vec<u8, 16>;
using u16sse = vec<u16, 8>;
using u32sse = vec<u32, 4>;
using u64sse = vec<u64, 2>;

using f32avx = vec<f32, 8>;
using f64avx = vec<f64, 4>;
using i8avx  = vec<i8, 32>;
using i16avx = vec<i16, 16>;
using i32avx = vec<i32, 8>;
using i64avx = vec<i64, 4>;
using u8avx  = vec<u8, 32>;
using u16avx = vec<u16, 16>;
using u32avx = vec<u32, 8>;
using u64avx = vec<u64, 4>;

using f32avx512 = vec<f32, 16>;
using f64avx512 = vec<f64, 8>;
using i8avx512  = vec<i8, 64>;
using i16avx512 = vec<i16, 32>;
using i32avx512 = vec<i32, 16>;
using i64avx512 = vec<i64, 8>;
using u8avx512  = vec<u8, 64>;
using u16avx512 = vec<u16, 32>;
using u32avx512 = vec<u32, 16>;
using u64avx512 = vec<u64, 8>;

#else
using f32neon = vec<f32, 4>;
using f64neon = vec<f64, 2>;
using i8neon  = vec<i8, 16>;
using i16neon = vec<i16, 8>;
using i32neon = vec<i32, 4>;
using i64neon = vec<i64, 2>;
using u8neon  = vec<u8, 16>;
using u16neon = vec<u16, 8>;
using u32neon = vec<u32, 4>;
using u64neon = vec<u64, 2>;
#endif

// Smallest SIMD-friendly width >= n for element type T (at least the minimum
// vector width, otherwise the next power of two).
template <typename T>
constexpr inline size_t next_simd_width(size_t n) CMT_NOEXCEPT
{
    return n < minimum_vector_width<T> ? minimum_vector_width<T> : next_poweroftwo(n);
}

// Widens a vector to the next SIMD width: a 1-element vector is broadcast,
// otherwise the vector is extended (tail contents unspecified) or, with an
// explicit fill value, widened.
template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, 1>& x) CMT_NOEXCEPT
{
    return broadcast<Nout>(x);
}

template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x) CMT_NOEXCEPT
{
    return extend<Nout>(x);
}

template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) CMT_NOEXCEPT
{
    return widen<Nout>(x, value);
}

// intrin(): applies fn at native vector width; for wider vectors it recurses
// into the low/high halves until each half fits in a register.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    result = fn(a, b, c);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, c.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, c.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    result = fn(a);
}

template <typename T, size_t Nvec = vector_width<T>, size_t N, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    intrin(result.h.low, a.h.low, fn);
    intrin(result.h.high, a.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, fn);
}

// Mixed vector/scalar variants: the scalar operand is passed through unchanged
// to both halves.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b, fn);
    intrin(result.h.high, a.h.high, b, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a, b.h.low, fn);
    intrin(result.h.high, a, b.h.high, fn);
}

// Generates all-size dispatch for a unary intrinsic, guarded by an extra
// compile-time condition: sub-width vectors are widened/shuffled, wider ones
// recurse through intrin() above.
#define KFR_HANDLE_ALL_SIZES_1_IF(fn, cond)                                                                  \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value && cond)>   \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq<Nout>)).shuffle(csizeseq<N>);                               \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, [](const auto& x) { return intrinsics::fn(x); });                                       \
        return r;                                                                                            \
    }

#define KFR_HANDLE_ALL_SIZES_1(fn) KFR_HANDLE_ALL_SIZES_1_IF(fn, true)

// Binary version: covers vec/vec, vec/scalar and scalar/vec argument shapes.
#define KFR_HANDLE_ALL_SIZES_2(fn)                                                                           \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), b.shuffle(csizeseq_t<Nout>()))                  \
            .shuffle(csizeseq<N>);                                                                           \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
        return r;                                                                                            \
    }                                                                                                        \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), vec<T, Nout>(b)).shuffle(csizeseq<N>);          \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
        return r;                                                                                            \
    }                                                                                                        \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(vec<T, Nout>(a), b.shuffle(csizeseq_t<Nout>())).shuffle(csizeseq<N>);          \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
        return r;                                                                                            \
    }

// vec1<T>: T itself if it is already a vec, otherwise a 1-element vec<T, 1>.
template <typename T>
using vec1 = conditional<is_vec<T>::value, T, vec<T, 1>>;

// to_scalar: unwraps a 1-element vector to its scalar; passes anything else through.
template <typename T>
inline const T& to_scalar(const T& value) CMT_NOEXCEPT
{
    return value;
}
template <typename T>
inline T to_scalar(const vec<T, 1>& value) CMT_NOEXCEPT
{
    return value[0];
}
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/simd/impl/intrinsics.h b/include/kfr/simd/impl/intrinsics.h
@@ -0,0 +1,50 @@
#pragma once

#include "../../cident.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

#ifdef CMT_ARCH_SSE2
#include <immintrin.h>
#ifdef CMT_OS_WIN
#include <intrin.h>
#endif
#endif

#ifdef CMT_ARCH_NEON
#include <arm_neon.h>
#endif

#if defined CMT_COMPILER_GCC && defined CMT_ARCH_X86
#include <x86intrin.h>
#endif

// addressof that ignores any overloaded operator&; clang has a builtin for it.
#ifdef CMT_COMPILER_CLANG
#define builtin_addressof(x) __builtin_addressof(x)
#else
template <class T>
inline T* builtin_addressof(T& arg)
{
    return reinterpret_cast<T*>(&const_cast<char&>(reinterpret_cast<const volatile char&>(arg)));
}
#endif

// Thin wrappers over compiler builtins (GNU-compatible compilers) or the C
// library so the rest of KFR can call one spelling everywhere.
#ifdef CMT_COMPILER_GNU
CMT_INLINE float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
CMT_INLINE double builtin_sqrt(double x) { return __builtin_sqrt(x); }
CMT_INLINE long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size)
{
    __builtin_memcpy(dest, src, size);
}
CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); }
#else
CMT_INLINE float builtin_sqrt(float x) { return ::sqrtf(x); }
CMT_INLINE double builtin_sqrt(double x) { return ::sqrt(x); }
CMT_INLINE long double builtin_sqrt(long double x) { return ::sqrtl(x); }
CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { ::memcpy(dest, src, size); }
CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { ::memset(dest, val, size); }
#endif

#define KFR_ENABLE_IF CMT_ENABLE_IF
diff --git a/include/kfr/simd/impl/operators.hpp b/include/kfr/simd/impl/operators.hpp
@@ -0,0 +1,164 @@
/** @addtogroup basic_math
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "function.hpp"

#ifdef CMT_CLANG_EXT
#include "basicoperators_clang.hpp"
#else
#include "basicoperators_generic.hpp"
#endif

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// Operators on vectors of complex<T>. Bitwise and additive operations act on
// the flattened (interleaved re/im) representation; mul/div implement the
// complex formulas with even/odd lane shuffles.

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> neg(const vec<complex<T>, N>& x)
{
    return neg(x.flatten()).v;
}

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> add(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return add(x.flatten(), y.flatten()).v;
}

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> sub(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return sub(x.flatten(), y.flatten()).v;
}

// (a+bi)(c+di): even lanes get ac-bd, odd lanes get ad+bc, via subadd of
// the even-duplicated and odd-duplicated factors.
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> mul(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    const vec<T, (N * 2)> xx = x.v;
    const vec<T, (N * 2)> yy = y.v;
    return subadd(mul(xx, dupeven(yy)), mul(swap<2>(xx), dupodd(yy))).v;
}

// Complex division: multiply by the conjugate and divide by |y|^2 = c^2 + d^2.
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> div(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    const vec<T, (N * 2)> xx = x.v;
    const vec<T, (N * 2)> yy = y.v;
    const vec<T, (N * 2)> m  = (add(sqr(dupeven(yy)), sqr(dupodd(yy))));
    return swap<2>(subadd(mul(swap<2>(xx), dupeven(yy)), mul(xx, dupodd(yy))) / m).v;
}

template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> bor(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return bor(x.flatten(), y.flatten()).v;
}
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> bxor(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return bxor(x.flatten(), y.flatten()).v;
}
template <typename T, size_t N>
KFR_INTRINSIC vec<complex<T>, N> band(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
{
    return band(x.flatten(), y.flatten()).v;
}

// Broadcasts a single complex scalar operand to a full vector and reuses the
// vec/vec overload.
#define KFR_COMPLEX_OP_CVT(fn)                                                                               \
    template <typename T, size_t N>                                                                          \
    KFR_INTRINSIC vec<complex<T>, N> fn(const vec<complex<T>, N>& x, const complex<T>& y)                    \
    {                                                                                                        \
        return fn(x, vec<complex<T>, N>(y));                                                                 \
    }                                                                                                        \
    template <typename T, size_t N>                                                                          \
    KFR_INTRINSIC vec<complex<T>, N> fn(const complex<T>& x, const vec<complex<T>, N>& y)                    \
    {                                                                                                        \
        return fn(vec<complex<T>, N>(x), y);                                                                 \
    }

KFR_COMPLEX_OP_CVT(mul)
KFR_COMPLEX_OP_CVT(div)
KFR_COMPLEX_OP_CVT(band)
KFR_COMPLEX_OP_CVT(bxor)
KFR_COMPLEX_OP_CVT(bor)

// Operators on nested vectors (vec<vec<T, N1>, N2>): flatten, promote the two
// element types to their common type, apply the flat operation, rewrap.
#define KFR_VECVEC_OP1(fn)                                                                                   \
    template <typename T1, size_t N1, size_t N2>                                                             \
    KFR_INTRINSIC vec<vec<T1, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x)                                     \
    {                                                                                                        \
        return fn(x.flatten()).v;                                                                            \
    }

#define KFR_VECVEC_OP2(fn)                                                                                   \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const vec<vec<T2, N1>, N2>& y)       \
    {                                                                                                        \
        return fn(innercast<C>(x.flatten()), innercast<C>(y.flatten())).v;                                   \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const T2& y)                         \
    {                                                                                                        \
        return fn(innercast<C>(x.flatten()), innercast<C>(y)).v;                                             \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const vec<T2, N1>& y)                \
    {                                                                                                        \
        return fn(innercast<C>(x.flatten()), repeat<N2>(innercast<C>(y.flatten()))).v;                       \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const T1& x, const vec<vec<T2, N1>, N2>& y)                         \
    {                                                                                                        \
        return fn(innercast<C>(x), innercast<C>(y.flatten())).v;                                             \
    }                                                                                                        \
    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<T1, N1>& x, const vec<vec<T2, N1>, N2>& y)                \
    {                                                                                                        \
        return fn(repeat<N2>(innercast<C>(x.flatten())), innercast<C>(y.flatten())).v;                       \
    }

KFR_VECVEC_OP1(neg)
KFR_VECVEC_OP1(bnot)
KFR_VECVEC_OP2(add)
KFR_VECVEC_OP2(sub)
KFR_VECVEC_OP2(mul)
KFR_VECVEC_OP2(div)
KFR_VECVEC_OP2(band)
KFR_VECVEC_OP2(bor)
KFR_VECVEC_OP2(bxor)

} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp
@@ -0,0 +1,183 @@
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "../platform.hpp"

namespace kfr
{

inline namespace CMT_ARCH_NAME
{

// Floating-point values whose bit patterns are all-ones / all-bits-except-sign.
// On GNU-compatible compilers these are built as NaN payloads so they stay
// constexpr; the generic fallback materialises them through SSE bit casts.
// NOTE(review): the CMT_COMPILER_MSVC branch also calls __builtin_nanf —
// presumably this path is only reached by clang-cl; verify plain MSVC never
// selects it.
#if defined CMT_COMPILER_GNU
constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
#elif defined CMT_COMPILER_MSVC
constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
#else
inline f32 allones_f32() CMT_NOEXCEPT
{
    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
}
inline f64 allones_f64() CMT_NOEXCEPT
{
    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
}
inline f32 invhighbit_f32() CMT_NOEXCEPT
{
    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
}
inline f64 invhighbit_f64() CMT_NOEXCEPT
{
    return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
}
#endif

// Per-scalar-type bit-pattern constants used by mask and sign-manipulation code:
// sign bit only, all bits set, all bits clear, all bits except the sign bit.
template <typename T>
struct special_scalar_constants
{
    constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
    constexpr static T allones() { return static_cast<T>(-1ll); }
    constexpr static T allzeros() { return T(0); }
    constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
};

// The Intel compiler cannot evaluate the NaN-based constants at compile time.
#ifndef CMT_COMPILER_INTEL
#define KFR_CONSTEXPR_NON_INTEL constexpr
#else
#define KFR_CONSTEXPR_NON_INTEL
#endif

template <>
struct special_scalar_constants<float>
{
    constexpr static float highbitmask() { return -0.f; }
    KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); };
    constexpr static float allzeros() { return 0.f; }
    KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
};

template <>
struct special_scalar_constants<double>
{
    constexpr static double highbitmask() { return -0.; }
    KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); };
    constexpr static double allzeros() { return 0.; }
    KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
};

// For compound types (e.g. vec/complex) the constants of the underlying
// scalar subtype are exposed.
template <typename T>
struct special_constants : public special_scalar_constants<subtype<T>>
{
public:
    using Tsub = subtype<T>;
};

namespace intrinsics
{

// Tag types carrying element type and lane count(s) for overload selection.
template <typename T, size_t N>
struct simd_t
{
    using value_type = T;

    constexpr static size_t size() { return N; }
};

template <typename T, size_t N1, size_t N2>
struct simd2_t
{
    using value_type = T;

    constexpr static size_t size1() { return N1; }

    constexpr static size_t size2() { return N2; }
};

template <typename Tout, typename Tin, size_t N>
struct simd_cvt_t
{
    using value_type_out = Tout;
    using value_type_in  = Tin;

    constexpr static size_t size() { return N; }
};

// Alignment for an N-element vector of T: next power of two of its byte size,
// capped at the platform's native vector alignment.
template <typename T, size_t N>
constexpr size_t alignment()
{
    return const_min(size_t(platform<>::native_vector_alignment), next_poweroftwo(sizeof(T) * N));
}

// Plain aligned array storage; rounded up to a power-of-two element count.
template <typename T, size_t N>
struct alignas(alignment<T, N>()) simd_array
{
    T val[next_poweroftwo(N)];
};

template <typename T, size_t N>
struct simd_type;

template <typename T>
struct simd_type<T, 0>
{
    // SFINAE
};

// A vector as a pair of halves; the half size is the largest power of two
// strictly below N. Explicit ctors/assignment are only compiled in when
// KFR_DEFINE_CTORS_FOR_HALVES is set.
template <typename T, size_t N>
struct simd_halves
{
    using subtype = typename simd_type<T, prev_poweroftwo(N - 1)>::type;

    subtype low;
    subtype high;
#if KFR_DEFINE_CTORS_FOR_HALVES
    simd_halves() CMT_NOEXCEPT {}
    simd_halves(const subtype& l, const subtype& h) CMT_NOEXCEPT : low(l), high(h) {}
    simd_halves(const simd_halves& v) CMT_NOEXCEPT : low(v.low), high(v.high) {}
    simd_halves(simd_halves&& v) CMT_NOEXCEPT : low(v.low), high(v.high) {}

    simd_halves& operator=(const simd_halves& v) CMT_NOEXCEPT
    {
        low  = v.low;
        high = v.high;
        return *this;
    }
    simd_halves& operator=(simd_halves&& v) CMT_NOEXCEPT
    {
        low  = v.low;
        high = v.high;
        return *this;
    }
#endif
};

} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/impl/specializations.i b/include/kfr/simd/impl/specializations.i
@@ -0,0 +1,116 @@
/**
 * Copyright (C) 2016 D Levin (http://www.kfrlib.com)
 * This file is part of KFR
 *
 * KFR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * KFR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with KFR.
 */
#pragma once

#include "../vec.hpp"
#ifndef KFR_SHUFFLE_SPECIALIZATIONS
#include "../shuffle.hpp"
#endif

// Hand-tuned shufflevector specializations for the permutations used by the
// FFT (digit-reversal and interleave patterns on f32 vectors).
// NOTE(review): this guard tests KFR_COMPILER_GNU while sibling headers test
// CMT_COMPILER_GNU — confirm KFR_COMPILER_GNU is actually defined somewhere,
// otherwise these specializations are never compiled.
#ifdef KFR_COMPILER_GNU

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{
// Radix-4 digit-reversal permutation of 32 floats, done as two half-register
// permutes followed by a cross-lane group permute.
template <>
inline vec<f32, 32> shufflevector<f32, 32>(
    csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14,
             15, 22, 23, 30, 31>,
    const vec<f32, 32>& x, const vec<f32, 32>&)
{
    f32x32 w = x;

    w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)),
               permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w)));

    w = permutegroups<(4), 0, 4, 2, 6, 1, 5, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
    return w;
}

template <>
inline vec<f32, 32> shufflevector<f32, 32>(
    csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22,
             23, 14, 15, 30, 31>,
    const vec<f32, 32>& x, const vec<f32, 32>&)
{
    f32x32 w = x;

    w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)),
               permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w)));

    w = permutegroups<(4), 0, 4, 1, 5, 2, 6, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
    return w;
}

// Helper reusing the specialization above for the 64-element case.
inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x)
{
    return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18,
                                         19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>,
                                  x, x);
}

template <>
inline vec<f32, 64> shufflevector<f32, 64>(
    csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13,
             44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38,
             39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>,
    const vec<f32, 64>& x, const vec<f32, 64>&)
{
    return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(
        concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x))));
}

// Deinterleave 16 floats: evens to the low half, odds to the high half.
template <>
inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>,
                                           const vec<f32, 16>& x, const vec<f32, 16>&)
{
    // asm volatile("int $3");
    const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x);

    return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)),
                  shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx)));
}

// Interleave the two halves of 16 floats.
template <>
inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>,
                                           const vec<f32, 16>& x, const vec<f32, 16>&)
{
    const vec<f32, 16> xx =
        concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));

    return permutegroups<(4), 0, 2, 1, 3>(xx);
}

// Interleave the two halves of 32 floats.
template <>
inline vec<f32, 32> shufflevector<f32, 32>(
    csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13,
             29, 14, 30, 15, 31>,
    const vec<f32, 32>& x, const vec<f32, 32>&)
{
    const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x);

    return concat(interleavehalfs(low(xx)), interleavehalfs(high(xx)));
}
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
#endif
diff --git a/include/kfr/simd/mask.hpp b/include/kfr/simd/mask.hpp
@@ -0,0 +1,155 @@
/** @addtogroup logical
 * @{
 */
/*
  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "vec.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +template <typename T> +using maskfor = typename T::mask_t; + +namespace internal +{ + +template <typename T> +constexpr inline T maskbits(bool value) +{ + return value ? special_constants<T>::allones() : special_constants<T>::allzeros(); +} +} // namespace internal + +template <typename T, size_t N> +struct mask : protected vec<T, N> +{ + using base = vec<T, N>; + + KFR_MEM_INTRINSIC mask() CMT_NOEXCEPT = default; + + KFR_MEM_INTRINSIC mask(const mask&) CMT_NOEXCEPT = default; + + KFR_MEM_INTRINSIC mask& operator=(const mask&) CMT_NOEXCEPT = default; + + using simd_type = typename base::simd_type; + + KFR_MEM_INTRINSIC mask(bool arg) : base(internal::maskbits<T>(arg)) {} + + template <typename... Args> + KFR_MEM_INTRINSIC mask(bool arg1, bool arg2, Args... args) + : base(internal::maskbits<T>(arg1), internal::maskbits<T>(arg2), + internal::maskbits<T>(static_cast<bool>(args))...) 
+ { + } + + using vec<T, N>::v; + + KFR_MEM_INTRINSIC mask(const base& v) CMT_NOEXCEPT; + + KFR_MEM_INTRINSIC mask(const simd_type& simd) : base(simd) {} + + template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> + KFR_MEM_INTRINSIC mask(const mask<U, N>& m) : base(base::frombits(m.asvec())) + { + } + + template <typename U, KFR_ENABLE_IF(sizeof(T) != sizeof(U))> + KFR_MEM_INTRINSIC mask(const mask<U, N>& m) + : base(base::frombits(innercast<itype<T>>(vec<itype<U>, N>::frombits(m.asvec())))) + { + } + + KFR_MEM_INTRINSIC bool operator[](size_t index) const CMT_NOEXCEPT; + + KFR_MEM_INTRINSIC constexpr base asvec() const CMT_NOEXCEPT { return base(v); } +}; + +namespace internal +{ + +template <typename T, size_t Nout, size_t N1, size_t... indices> +constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>) +{ + return make_vector(maskbits<T>(indices < N1)...); +} + +template <typename T, size_t Nout, size_t N1> +constexpr vec<T, Nout> partial_mask() +{ + return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>()); +} +} // namespace internal + +template <typename T, size_t N> +KFR_MEM_INTRINSIC bool mask<T, N>::operator[](size_t index) const CMT_NOEXCEPT +{ + return ibitcast(base::operator[](index)) < 0; +} + +template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)> +constexpr KFR_INTRINSIC mask<T, Nout> make_mask(bool arg, Args... 
args) +{ + return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...); +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr + +namespace cometa +{ + +template <typename T, size_t N> +struct compound_type_traits<kfr::mask<T, N>> +{ + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + template <typename U> + using rebind = kfr::mask<U, N>; + template <typename U> + using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; + + KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::mask<T, N>& value, size_t index) + { + return value[index]; + } +}; +} // namespace cometa + +namespace std +{ +template <typename T1, typename T2, size_t N> +struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> +{ + using type = kfr::mask<typename common_type<T1, T2>::type, N>; +}; +} // namespace std diff --git a/include/kfr/simd/operators.hpp b/include/kfr/simd/operators.hpp @@ -0,0 +1,810 @@ +/** @addtogroup basic_math + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "impl/operators.hpp" +#include "mask.hpp" +#include <algorithm> +#include <utility> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +#define KFR_VEC_OPERATOR1(op, fn) \ + template <typename T, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x) \ + { \ + return intrinsics::fn(x); \ + } + +#define KFR_VEC_OPERATOR2(op, asgnop, fn) \ + template <typename T1, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + x = intrinsics::fn(x, elemcast<T1>(y)); \ + return x; \ + } \ + template <typename T1, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const T2& y) \ + { \ + x = intrinsics::fn(x, T1(y)); \ + return x; \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC vec<C, N> operator op(const vec<T1, N>& x, const T2& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), C(y)); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC vec<C, N> operator op(const T1& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(C(x), elemcast<C>(y)); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC vec<C, N> operator op(const vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), elemcast<C>(y)); \ + } + +#define KFR_VEC_SHIFT_OPERATOR(op, asgnop, fn) \ + template <typename T1, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, unsigned y) \ + { \ + x = 
intrinsics::fn(x, y); \ + return x; \ + } \ + template <typename T1, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + x = intrinsics::fn(x, elemcast<utype<T1>>(y)); \ + return x; \ + } \ + template <typename T, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x, unsigned y) \ + { \ + return intrinsics::fn(x, y); \ + } \ + template <typename T, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const T& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(innercast<T>(x), elemcast<utype<T>>(y)); \ + } \ + template <typename T, typename T2, size_t N> \ + constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(x, elemcast<utype<T>>(y)); \ + } + +#define KFR_VEC_CMP_OPERATOR(op, fn) \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC mask<C, N> operator op(const vec<T1, N>& x, const T2& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), vec<C, N>(y)).asmask(); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC mask<C, N> operator op(const T1& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(vec<C, N>(x), elemcast<C>(y)).asmask(); \ + } \ + template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \ + constexpr KFR_INTRINSIC mask<C, N> operator op(const vec<T1, N>& x, const vec<T2, N>& y) \ + { \ + return intrinsics::fn(elemcast<C>(x), elemcast<C>(y)).asmask(); \ + } + +KFR_VEC_OPERATOR1(-, neg) +KFR_VEC_OPERATOR1(~, bnot) + +KFR_VEC_OPERATOR2(+, +=, add) +KFR_VEC_OPERATOR2(-, -=, sub) +KFR_VEC_OPERATOR2(*, *=, mul) +KFR_VEC_OPERATOR2(/, /=, div) + +KFR_VEC_OPERATOR2(&, &=, band) +KFR_VEC_OPERATOR2(|, |=, bor) +KFR_VEC_OPERATOR2 (^, ^=, bxor) +KFR_VEC_SHIFT_OPERATOR(<<, <<=, shl) +KFR_VEC_SHIFT_OPERATOR(>>, >>=, shr) + 
+KFR_VEC_CMP_OPERATOR(==, eq) +KFR_VEC_CMP_OPERATOR(!=, ne) +KFR_VEC_CMP_OPERATOR(>=, ge) +KFR_VEC_CMP_OPERATOR(<=, le) +KFR_VEC_CMP_OPERATOR(>, gt) +KFR_VEC_CMP_OPERATOR(<, lt) + +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator&(const mask<T1, N>& x, const mask<T2, N>& y)CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) & bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator|(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) | bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator&&(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) & bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator||(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) | bitcast<C>(vec<T2, N>(y.v))).v); +} +template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>, + KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))> +KFR_INTRINSIC mask<C, N> operator^(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT +{ + return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) ^ bitcast<C>(vec<T2, N>(y.v))).v); +} + +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> operator~(const mask<T, N>& x) CMT_NOEXCEPT +{ + return ~x.asvec(); +} +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> operator!(const mask<T, N>& x) CMT_NOEXCEPT +{ + return ~x.asvec(); +} + +KFR_INTRINSIC 
float bitwisenot(float x) { return fbitcast(~ubitcast(x)); } +KFR_INTRINSIC float bitwiseor(float x, float y) { return fbitcast(ubitcast(x) | ubitcast(y)); } +KFR_INTRINSIC float bitwiseand(float x, float y) { return fbitcast(ubitcast(x) & ubitcast(y)); } +KFR_INTRINSIC float bitwiseandnot(float x, float y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } +KFR_INTRINSIC float bitwisexor(float x, float y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } +KFR_INTRINSIC double bitwisenot(double x) { return fbitcast(~ubitcast(x)); } +KFR_INTRINSIC double bitwiseor(double x, double y) { return fbitcast(ubitcast(x) | ubitcast(y)); } +KFR_INTRINSIC double bitwiseand(double x, double y) { return fbitcast(ubitcast(x) & ubitcast(y)); } +KFR_INTRINSIC double bitwiseandnot(double x, double y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); } +KFR_INTRINSIC double bitwisexor(double x, double y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); } + +/// @brief Bitwise Not +template <typename T1> +KFR_INTRINSIC T1 bitwisenot(const T1& x) +{ + return ~x; +} +KFR_FN(bitwisenot) + +/// @brief Bitwise And +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwiseand(const T1& x, const T2& y) +{ + return x & y; +} +template <typename T> +constexpr KFR_INTRINSIC T bitwiseand(initialvalue<T>) +{ + return constants<T>::allones(); +} +KFR_FN(bitwiseand) + +/// @brief Bitwise And-Not +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y) +{ + return x & ~y; +} +template <typename T> +constexpr inline T bitwiseandnot(initialvalue<T>) +{ + return constants<T>::allones(); +} +KFR_FN(bitwiseandnot) + +/// @brief Bitwise Or +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwiseor(const T1& x, const T2& y) +{ + return x | y; +} +template <typename T> +constexpr KFR_INTRINSIC T bitwiseor(initialvalue<T>) +{ + return subtype<T>(0); +} +KFR_FN(bitwiseor) + +/// @brief Bitwise Xor (Exclusive Or) 
+template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> bitwisexor(const T1& x, const T2& y) +{ + return x ^ y; +} +template <typename T> +constexpr KFR_INTRINSIC T bitwisexor(initialvalue<T>) +{ + return subtype<T>(); +} +KFR_FN(bitwisexor) + +/// @brief Bitwise Left shift +template <typename T1, typename T2> +KFR_INTRINSIC T1 shl(const T1& left, const T2& right) +{ + return left << right; +} +KFR_FN(shl) + +/// @brief Bitwise Right shift +template <typename T1, typename T2> +KFR_INTRINSIC T1 shr(const T1& left, const T2& right) +{ + return left >> right; +} +KFR_FN(shr) + +/// @brief Bitwise Left Rotate +template <typename T1, typename T2> +KFR_INTRINSIC T1 rol(const T1& left, const T2& right) +{ + return shl(left, right) | shr(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); +} +KFR_FN(rol) + +/// @brief Bitwise Right Rotate +template <typename T1, typename T2> +KFR_INTRINSIC T1 ror(const T1& left, const T2& right) +{ + return shr(left, right) | shl(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right)); +} +KFR_FN(ror) + +template <typename T> +constexpr KFR_INTRINSIC T add(const T& x) +{ + return x; +} + +/** + * @brief Returns sum of all the arguments passed to a function. + */ +template <typename T1, typename T2, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, T2, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, T2, Ts...> add(const T1& x, const T2& y, const Ts&... rest) +{ + return x + add(y, rest...); +} +template <typename T> +constexpr KFR_INTRINSIC T add(initialvalue<T>) +{ + return T(0); +} +KFR_FN(add) + +/** + * @brief Returns template expression that returns sum of all the arguments passed to a function. + */ +template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)> +KFR_INTRINSIC internal::expression_function<fn::add, E...> add(E&&... x) +{ + return { fn::add(), std::forward<E>(x)... 
}; +} + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> sub(const T1& x, const T2& y) +{ + return x - y; +} +template <typename T> +constexpr KFR_INTRINSIC T sub(initialvalue<T>) +{ + return T(0); +} +KFR_FN(sub) + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::sub, E1, E2> sub(E1&& x, E2&& y) +{ + return { fn::sub(), std::forward<E1>(x), std::forward<E2>(y) }; +} + +template <typename T1> +constexpr KFR_INTRINSIC T1 mul(const T1& x) +{ + return x; +} + +/** + * @brief Returns product of all the arguments passed to a function. + */ +template <typename T1, typename T2, typename... Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, Ts...> mul(const T1& x, const T2& y, const Ts&... rest) +{ + return x * mul(y, rest...); +} + +template <typename T> +constexpr KFR_INTRINSIC T mul(initialvalue<T>) +{ + return T(1); +} +KFR_FN(mul) + +/** + * @brief Returns template expression that returns product of all the arguments passed to a function. + */ +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::mul, E...> mul(E&&... x) +{ + return { fn::mul(), std::forward<E>(x)... }; +} + +/** + * @brief Returns square of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +constexpr inline T1 sqr(const T1& x) +{ + return x * x; +} +KFR_FN(sqr) + +/** + * @brief Returns template expression that returns square of x. + */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::sqr, E1> sqr(E1&& x) +{ + return { fn::sqr(), std::forward<E1>(x) }; +} + +/** + * @brief Returns cube of x. + */ +template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)> +constexpr inline T1 cub(const T1& x) +{ + return sqr(x) * x; +} +KFR_FN(cub) + +/** + * @brief Returns template expression that returns cube of x. 
+ */ +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::cub, E1> cub(E1&& x) +{ + return { fn::cub(), std::forward<E1>(x) }; +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow2(const T& x) +{ + return sqr(x); +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow3(const T& x) +{ + return cub(x); +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow4(const T& x) +{ + return sqr(sqr(x)); +} + +template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)> +constexpr KFR_INTRINSIC T pow5(const T& x) +{ + return pow4(x) * x; +} +KFR_FN(pow2) +KFR_FN(pow3) +KFR_FN(pow4) +KFR_FN(pow5) + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow2, E1> pow2(E1&& x) +{ + return { fn::pow2(), std::forward<E1>(x) }; +} +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow3, E1> pow3(E1&& x) +{ + return { fn::pow3(), std::forward<E1>(x) }; +} +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow4, E1> pow4(E1&& x) +{ + return { fn::pow4(), std::forward<E1>(x) }; +} +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::pow5, E1> pow5(E1&& x) +{ + return { fn::pow5(), std::forward<E1>(x) }; +} + +/// Raise x to the power base \f$ x^{base} \f$ +/// @code +/// CHECK( ipow( 10, 3 ) == 1000 ); +/// CHECK( ipow( 0.5, 2 ) == 0.25 ); +/// @endcode +template <typename T> +constexpr inline T ipow(const T& x, int base) +{ + T xx = x; + T result = T(1); + while (base) + { + if (base & 1) + result *= xx; + base >>= 1; + xx *= xx; + } + return result; +} +KFR_FN(ipow) + +template <typename E1, typename E2, 
KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::ipow, E1, E2> ipow(E1&& x, E2&& b) +{ + return { fn::ipow(), std::forward<E1>(x), std::forward<E2>(b) }; +} + +/// Return square of the sum of all arguments +/// @code +/// CHECK(sqrsum(1,2,3) == 36); +/// @endcode +template <typename T1, typename... Ts> +constexpr inline common_type<T1, Ts...> sqrsum(const T1& x, const Ts&... rest) +{ + return sqr(add(x, rest...)); +} + +template <typename T1, typename T2> +constexpr inline common_type<T1, T2> sqrdiff(const T1& x, const T2& y) +{ + return sqr(x - y); +} +KFR_FN(sqrsum) +KFR_FN(sqrdiff) + +/// Division +template <typename T1, typename T2, typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout div(const T1& x, const T2& y) +{ + return static_cast<Tout>(x) / static_cast<Tout>(y); +} +KFR_FN(div) + +/// Remainder +template <typename T1, typename T2, typename Tout = common_type<T1, T2>> +KFR_INTRINSIC Tout rem(const T1& x, const T2& y) +{ + return static_cast<Tout>(x) % static_cast<Tout>(y); +} +KFR_FN(rem) + +/// Negation +template <typename T1> +inline T1 neg(const T1& x) +{ + return -x; +} +KFR_FN(neg) + +/// @brief Fused Multiply-Add +template <typename T1, typename T2, typename T3> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> fmadd(const T1& x, const T2& y, const T3& z) +{ + return x * y + z; +} +/// @brief Fused Multiply-Sub +template <typename T1, typename T2, typename T3> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> fmsub(const T1& x, const T2& y, const T3& z) +{ + return x * y - z; +} +KFR_FN(fmadd) +KFR_FN(fmsub) + +/// @brief Linear blend of `x` and `y` (`c` must be in the range 0...+1) +/// Returns `x + ( y - x ) * c` +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> mix(const T1& c, const T2& x, const T3& y) +{ + return fmadd(c, y - x, x); +} + +/// @brief Linear blend of `x` and `y` (`c` must be in 
the range -1...+1) +template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)> +KFR_INTRINSIC constexpr common_type<T1, T2, T3> mixs(const T1& c, const T2& x, const T3& y) +{ + return mix(fmadd(c, 0.5, 0.5), x, y); +} +KFR_FN(mix) +KFR_FN(mixs) + +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::mix, E1, E2, E3> mix(E1&& c, E2&& x, E3&& y) +{ + return { fn::mix(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; +} + +template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)> +KFR_INTRINSIC internal::expression_function<fn::mixs, E1, E2, E3> mixs(E1&& c, E2&& x, E3&& y) +{ + return { fn::mixs(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) }; +} + +namespace intrinsics +{ + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> horner(const T1&, const T2& c0) +{ + return c0; +} + +template <typename T1, typename T2, typename T3, typename... Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner(const T1& x, const T2& c0, const T3& c1, + const Ts&... values) +{ + return fmadd(horner(x, c1, values...), x, c0); +} + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> horner_even(const T1&, const T2& c0) +{ + return c0; +} + +template <typename T1, typename T2, typename T3, typename... Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner_even(const T1& x, const T2& c0, const T3& c2, + const Ts&... values) +{ + const T1 x2 = x * x; + return fmadd(horner(x2, c2, values...), x2, c0); +} + +template <typename T1, typename T2> +constexpr KFR_INTRINSIC common_type<T1, T2> horner_odd(const T1& x, const T2& c1) +{ + return c1 * x; +} + +template <typename T1, typename T2, typename T3, typename... 
Ts> +constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner_odd(const T1& x, const T2& c1, const T3& c3, + const Ts&... values) +{ + const T1 x2 = x * x; + return fmadd(horner(x2, c3, values...), x2, c1) * x; +} +} // namespace intrinsics + +/// @brief Calculate polynomial using Horner's method +/// +/// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\) +template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, Ts...> horner(const T1& x, const Ts&... c) +{ + return intrinsics::horner(x, c...); +} +KFR_FN(horner) + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::horner, E...> horner(E&&... x) +{ + return { fn::horner(), std::forward<E>(x)... }; +} + +/// @brief Calculate polynomial using Horner's method (even powers) +/// +/// ``horner_even(x, 1, 2, 3)`` is equivalent to \(3x^4 + 2x^2 + 1\) +template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, Ts...> horner_even(const T1& x, const Ts&... c) +{ + return intrinsics::horner_even(x, c...); +} +KFR_FN(horner_even) + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::horner_even, E...> horner_even(E&&... x) +{ + return { fn::horner_even(), std::forward<E>(x)... }; +} + +/// @brief Calculate polynomial using Horner's method (odd powers) +/// +/// ``horner_odd(x, 1, 2, 3)`` is equivalent to \(3x^5 + 2x^3 + 1x\) +template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)> +constexpr KFR_INTRINSIC common_type<T1, Ts...> horner_odd(const T1& x, const Ts&... c) +{ + return intrinsics::horner_odd(x, c...); +} +KFR_FN(horner_odd) + +template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +KFR_INTRINSIC internal::expression_function<fn::horner_odd, E...> horner_odd(E&&... 
x) +{ + return { fn::horner_odd(), std::forward<E>(x)... }; +} + +/// @brief Calculate Multiplicative Inverse of `x` +/// Returns `1/x` +template <typename T> +constexpr KFR_INTRINSIC T reciprocal(const T& x) +{ + static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type"); + return subtype<T>(1) / x; +} +KFR_FN(reciprocal) + +template <typename T1, typename T2> +KFR_INTRINSIC common_type<T1, T2> mulsign(const T1& x, const T2& y) +{ + return bitwisexor(x, bitwiseand(y, special_constants<T2>::highbitmask())); +} +KFR_FN(mulsign) + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) +{ + return (x & special_constants<T>::highbitmask()) | (y & special_constants<T>::highbitmask()); +} + +/// @brief Swap byte order +template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)> +KFR_INTRINSIC vec<T, N> swapbyteorder(const vec<T, N>& x) +{ + return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x))); +} +template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)> +KFR_INTRINSIC T swapbyteorder(const T& x) +{ + return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x))); +} +template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)> +KFR_INTRINSIC T swapbyteorder(const T& x) +{ + return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x))); +} +template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)> +KFR_INTRINSIC T swapbyteorder(const T& x) +{ + return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x))); +} +KFR_FN(swapbyteorder) + +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return blend<1, 0>(a + b, a - b); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return blend<0, 1>(a + b, a - b); +} +KFR_FN(subadd) +KFR_FN(addsub) + +template <typename 
T, size_t N> +KFR_INTRINSIC vec<T, N> negeven(const vec<T, N>& x) +{ + return x ^ broadcast<N>(-T(), T()); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> negodd(const vec<T, N>& x) +{ + return x ^ broadcast<N>(T(), -T()); +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::neg, E1> operator-(E1&& e1) +{ + return { fn::neg(), std::forward<E1>(e1) }; +} + +template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwisenot, E1> operator~(E1&& e1) +{ + return { fn::bitwisenot(), std::forward<E1>(e1) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::add, E1, E2> operator+(E1&& e1, E2&& e2) +{ + return { fn::add(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::sub, E1, E2> operator-(E1&& e1, E2&& e2) +{ + return { fn::sub(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::mul, E1, E2> operator*(E1&& e1, E2&& e2) +{ + return { fn::mul(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::div, E1, E2> operator/(E1&& e1, E2&& e2) +{ + return { fn::div(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwiseand, E1, E2> operator&(E1&& e1, E2&& e2) +{ + return { fn::bitwiseand(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, 
KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwiseor, E1, E2> operator|(E1&& e1, E2&& e2) +{ + return { fn::bitwiseor(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::bitwisexor, E1, E2> operator^(E1&& e1, E2&& e2) +{ + return { fn::bitwisexor(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::shl, E1, E2> operator<<(E1&& e1, E2&& e2) +{ + return { fn::shl(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)> +KFR_INTRINSIC internal::expression_function<fn::shr, E1, E2> operator>>(E1&& e1, E2&& e2) +{ + return { fn::shr(), std::forward<E1>(e1), std::forward<E2>(e2) }; +} + +template <typename T, size_t N1, size_t... Ns> +vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest) +{ + const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...)); + return t.v; +} + +KFR_FN(packtranspose) + +template <typename T, size_t N> +KFR_I_CE mask<T, N>::mask(const base& v) CMT_NOEXCEPT +{ + this->v = base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()).v; +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp @@ -0,0 +1,286 @@ +/** @addtogroup types + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "types.hpp" + +namespace kfr +{ + +/// @brief An enumeration representing cpu instruction set +enum class cpu_t : int +{ + generic = 0, +#ifdef CMT_ARCH_X86 + sse2 = 1, + sse3 = 2, + ssse3 = 3, + sse41 = 4, + sse42 = 5, + avx1 = 6, + avx2 = 7, + avx512 = 8, // F, CD, VL, DQ and BW + avx = static_cast<int>(avx1), + lowest = static_cast<int>(sse2), + highest = static_cast<int>(avx512), +#endif +#ifdef CMT_ARCH_ARM + neon = 1, + neon64 = 2, + lowest = static_cast<int>(neon), + highest = static_cast<int>(neon64), +#endif + native = static_cast<int>(CMT_ARCH_NAME), + +#ifdef CMT_ARCH_AVX +#define KFR_HAS_SECONDARY_PLATFORM + secondary = static_cast<int>(sse42), +#else + secondary = static_cast<int>(native), +#endif + + common = generic, // For compatibility + runtime = -1, +}; + +#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native + +template <cpu_t cpu> +using ccpu_t = cval_t<cpu_t, cpu>; + +template <cpu_t cpu> +constexpr ccpu_t<cpu> ccpu{}; + +namespace internal_generic +{ +constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); } +constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); } + +#ifdef CMT_ARCH_X86 +constexpr auto cpu_list = cvals_t<cpu_t, cpu_t::avx512, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, + cpu_t::sse3, cpu_t::sse2>(); +#else +constexpr auto cpu_list = 
cvals<cpu_t, cpu_t::neon>; +#endif +} // namespace internal_generic + +template <cpu_t cpu> +using cpuval_t = cval_t<cpu_t, cpu>; +template <cpu_t cpu> +constexpr auto cpuval = cpuval_t<cpu>{}; + +constexpr auto cpu_all = + cfilter(internal_generic::cpu_list, internal_generic::cpu_list >= cpuval_t<cpu_t::native>()); + +/// @brief Returns name of the cpu instruction set +CMT_UNUSED static const char* cpu_name(cpu_t set) +{ +#ifdef CMT_ARCH_X86 + static const char* names[] = { "generic", "sse2", "sse3", "ssse3", "sse41", + "sse42", "avx", "avx2", "avx512" }; +#endif +#ifdef CMT_ARCH_ARM + static const char* names[] = { "generic", "neon", "neon64" }; +#endif + if (set >= cpu_t::lowest && set <= cpu_t::highest) + return names[static_cast<size_t>(set)]; + return "-"; +} + +#ifdef CMT_ARCH_X64 +template <int = 0> +constexpr inline const char* bitness_const(const char*, const char* x64) +{ + return x64; +} +template <typename T> +constexpr inline const T& bitness_const(const T&, const T& x64) +{ + return x64; +} +#else +template <int = 0> +constexpr inline const char* bitness_const(const char* x32, const char*) +{ + return x32; +} +template <typename T> +constexpr inline const T& bitness_const(const T& x32, const T&) +{ + return x32; +} +#endif + +template <cpu_t c = cpu_t::native> +struct platform; + +#ifdef CMT_ARCH_X86 +template <> +struct platform<cpu_t::common> +{ + constexpr static size_t native_cache_alignment = 64; + constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; + constexpr static size_t maximum_vector_alignment = 64; + constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; + + constexpr static size_t simd_register_count = 1; + + constexpr static size_t common_float_vector_size = 16; + constexpr static size_t common_int_vector_size = 16; + + constexpr static size_t minimum_float_vector_size = 16; + constexpr static size_t minimum_int_vector_size = 16; + + constexpr static size_t 
native_float_vector_size = 16; + constexpr static size_t native_int_vector_size = 16; + + constexpr static size_t native_vector_alignment = 16; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static bool fast_unaligned = false; +}; +template <> +struct platform<cpu_t::sse2> : platform<cpu_t::common> +{ + constexpr static size_t simd_register_count = bitness_const(8, 16); +}; +template <> +struct platform<cpu_t::sse3> : platform<cpu_t::sse2> +{ +}; +template <> +struct platform<cpu_t::ssse3> : platform<cpu_t::sse3> +{ +}; +template <> +struct platform<cpu_t::sse41> : platform<cpu_t::ssse3> +{ +}; +template <> +struct platform<cpu_t::sse42> : platform<cpu_t::sse41> +{ +}; +template <> +struct platform<cpu_t::avx> : platform<cpu_t::sse42> +{ + constexpr static size_t native_float_vector_size = 32; + + constexpr static size_t native_vector_alignment = 32; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static bool fast_unaligned = true; +}; +template <> +struct platform<cpu_t::avx2> : platform<cpu_t::avx> +{ + constexpr static size_t native_int_vector_size = 32; +}; +template <> +struct platform<cpu_t::avx512> : platform<cpu_t::avx2> +{ + constexpr static size_t native_float_vector_size = 64; + constexpr static size_t native_int_vector_size = 64; + + constexpr static size_t native_vector_alignment = 64; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static size_t simd_register_count = bitness_const(8, 32); +}; +#endif +#ifdef CMT_ARCH_ARM +template <> +struct platform<cpu_t::common> +{ + constexpr static size_t native_cache_alignment = 64; + constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; + constexpr static size_t maximum_vector_alignment = 16; + constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; + + constexpr static size_t simd_register_count = 
1; + + constexpr static size_t common_float_vector_size = 16; + constexpr static size_t common_int_vector_size = 16; + + constexpr static size_t minimum_float_vector_size = 16; + constexpr static size_t minimum_int_vector_size = 16; + + constexpr static size_t native_float_vector_size = 16; + constexpr static size_t native_int_vector_size = 16; + + constexpr static size_t native_vector_alignment = 16; + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; + + constexpr static bool fast_unaligned = false; +}; +template <> +struct platform<cpu_t::neon> : platform<cpu_t::common> +{ +}; +template <> +struct platform<cpu_t::neon64> : platform<cpu_t::neon> +{ +}; +#endif + +inline namespace CMT_ARCH_NAME +{ + +/// @brief SIMD vector width for the given cpu instruction set +template <typename T> +constexpr static size_t vector_width = + (const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::native_float_vector_size / sizeof(T) + : platform<>::native_int_vector_size / sizeof(T))); + +template <typename T> +constexpr static size_t minimum_vector_width = + (const_max(size_t(1), typeclass<T> == datatype::f ? 
platform<>::minimum_float_vector_size / sizeof(T) + : platform<>::minimum_int_vector_size / sizeof(T))); + +template <typename T> +constexpr static size_t vector_capacity = platform<>::simd_register_count* vector_width<T>; + +#ifdef CMT_COMPILER_MSVC +template <typename T> +constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_width<T> * 2); +#else +template <typename T> +constexpr static size_t maximum_vector_size = const_min( + static_cast<size_t>(32), const_max(size_t(1), platform<>::simd_register_count / 4) * vector_width<T>); +#endif + +template <typename T> +constexpr static bool is_simd_size(size_t size) +{ + return is_poweroftwo(size) && size >= minimum_vector_width<T> && size <= vector_width<T>; +} + +template <typename T, size_t N = vector_width<T>> +struct vec; +template <typename T, size_t N = vector_width<T>> +struct mask; + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/read_write.hpp b/include/kfr/simd/read_write.hpp @@ -0,0 +1,243 @@ +/** @addtogroup read_write + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. 
+ See https://www.kfrlib.com for details. + */ +#pragma once + +#include "shuffle.hpp" +#include "types.hpp" +#include "vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <size_t N, bool A = false, typename T> +KFR_INTRINSIC static vec<T, N> read(const T* src) +{ + return vec<T, N>(src, cbool_t<A>()); +} + +template <bool A = false, size_t N, typename T> +KFR_INTRINSIC static void write(T* dest, const vec<T, N>& value) +{ + value.write(dest, cbool_t<A>()); +} + +template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> +KFR_INTRINSIC vec<T, Nout> gather(const T* base, size_t index, Indices... indices) +{ + return make_vector(base[index], base[indices]...); +} + +template <size_t Index, size_t... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> +KFR_INTRINSIC vec<T, Nout> gather(const T* base) +{ + return make_vector(base[Index], base[Indices]...); +} + +template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0> +KFR_INTRINSIC void scatter(const T* base, const vec<T, N>& value) +{ + base[Index] = value[InIndex]; + scatter<Indices..., T, N, InIndex + 1>(base, value); +} + +namespace internal +{ +template <typename T, size_t N, size_t... Indices> +KFR_INTRINSIC vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) +{ + return make_vector(base[indices[Indices]]...); +} +template <size_t Nout, size_t Stride, typename T, size_t... Indices> +KFR_INTRINSIC vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>) +{ + return make_vector(base[Indices * Stride]...); +} +template <size_t Nout, size_t groupsize, typename T, size_t... 
Indices> +KFR_INTRINSIC vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>) +{ + return make_vector(read<groupsize>(base + Indices * groupsize * stride)...); +} +} // namespace internal + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> gather(const T* base, const vec<u32, N>& indices) +{ + return internal::gather(base, indices, csizeseq<N>); +} + +template <size_t Nout, size_t groupsize = 1, typename T> +KFR_INTRINSIC vec<T, Nout * groupsize> gather_stride(const T* base, size_t stride) +{ + return internal::gather_stride_s<Nout, groupsize>(base, stride, csizeseq<Nout>); +} + +template <size_t Nout, size_t Stride, typename T> +KFR_INTRINSIC vec<T, Nout> gather_stride(const T* base) +{ + return internal::gather_stride<Nout, Stride>(base, csizeseq<Nout>); +} + +template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices> +KFR_INTRINSIC vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, + csizes_t<Indices...>) +{ + return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...); +} +template <size_t groupsize = 1, typename T, size_t N, typename IT> +KFR_INTRINSIC vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) +{ + return gather_helper<groupsize>(base, offset, csizeseq<N>); +} + +template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> +KFR_INTRINSIC void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, + csizes_t<Indices...>) +{ + swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)), + 0)... }; +} +template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, size_t... Indices> +KFR_INTRINSIC void scatter_helper_s(T* base, size_t stride, const vec<T, Nout>& value, csizes_t<Indices...>) +{ + swallow{ (write(base + groupsize * stride, slice<Indices * groupsize, groupsize>(value)), 0)... 
}; +} +template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> +KFR_INTRINSIC void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) +{ + return scatter_helper<groupsize>(base, offset, value, csizeseq<N>); +} + +template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> +KFR_INTRINSIC void scatter_stride(T* base, const vec<T, Nout>& value, size_t stride) +{ + return scatter_helper_s<groupsize>(base, stride, value, csizeseq<N>); +} + +template <typename T, size_t groupsize = 1> +struct stride_pointer : public stride_pointer<const T, groupsize> +{ + template <size_t N> + void write(const vec<T, N>& val, csize_t<N> = csize_t<N>()) + { + kfr::scatter_stride<N, groupsize>(this->ptr, val); + } +}; + +template <typename T, size_t groupsize> +struct stride_pointer<const T, groupsize> +{ + const T* ptr; + const size_t stride; + + template <size_t N> + vec<T, N> read(csize_t<N> = csize_t<N>()) + { + return kfr::gather_stride<N, groupsize>(ptr, stride); + } +}; + +template <typename T> +constexpr T partial_masks[] = { constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + T(), + T(), + 
T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T(), + T() }; + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> partial_mask(size_t index) +{ + static_assert(N <= arraysize(partial_masks<T>) / 2, + "N must not be greater than half of partial_masks array"); + return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> partial_mask(size_t index, vec_shape<T, N>) +{ + return partial_mask<T, N>(index); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/shuffle.hpp b/include/kfr/simd/shuffle.hpp @@ -0,0 +1,569 @@ +/** @addtogroup shuffle + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once +#include "constants.hpp" +#include "mask.hpp" +#include "types.hpp" +#include "vec.hpp" + +#include <tuple> +#include <utility> + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec<T, Nout> low(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout>); +} + +template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec_shape<T, Nout> low(vec_shape<T, N>) +{ + return {}; +} + +template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec<T, Nout> high(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout, prev_poweroftwo(N - 1)>); +} + +template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec_shape<T, Nout> high(vec_shape<T, N>) +{ + return {}; +} + +template <typename T, size_t... Ns> +KFR_INTRINSIC vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) CMT_NOEXCEPT +{ + return vec<T, csum<size_t, Ns...>()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, Ns>::scalar_size()...>(vs.v...)); +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC vec<T, N1 + N2> concat2(const vec<T, N1>& x, const vec<T, N2>& y) CMT_NOEXCEPT +{ + return vec<T, csum<size_t, N1, N2>()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N1>::scalar_size(), + vec<T, N2>::scalar_size()>(x.v, y.v)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N * 4> concat4(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, + const vec<T, N>& d) CMT_NOEXCEPT +{ + return intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N * 2>::scalar_size(), + vec<T, N * 2>::scalar_size()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(), + vec<T, N>::scalar_size()>(a.v, b.v), + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(), + vec<T, N>::scalar_size()>(c.v, 
d.v)); +} + +template <size_t count, typename T, size_t N, size_t Nout = N* count> +KFR_INTRINSIC vec<T, Nout> repeat(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout> % csize<N>); +} + +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)> +KFR_INTRINSIC vec<T, Nout> resize(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout> % csize<N>); +} +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)> +constexpr KFR_INTRINSIC const vec<T, Nout>& resize(const vec<T, N>& x) +{ + return x; +} + +namespace intrinsics +{ + +template <typename T, typename... Ts, size_t... indices, size_t Nin = sizeof...(Ts), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC vec<T, Nout> broadcast_helper(csizes_t<indices...>, const Ts&... values) +{ + const std::tuple<Ts...> tup(values...); + return vec<T, Nout>(std::get<indices % Nin>(tup)...); +} +} // namespace intrinsics + +template <size_t Nout, typename... Ts, typename C = typename std::common_type<Ts...>::type> +KFR_INTRINSIC vec<C, Nout> broadcast(const Ts&... 
values) +{ + return intrinsics::broadcast_helper<C>(csizeseq<Nout>, values...); +} +KFR_FN(broadcast) + +template <size_t Ncount, typename T, size_t N> +KFR_INTRINSIC vec<T, N + Ncount> padhigh(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<N + Ncount>); +} +KFR_FN(padhigh) + +template <size_t Ncount, typename T, size_t N> +KFR_INTRINSIC vec<T, N + Ncount> padlow(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<N + Ncount, 0 - Ncount>); +} +KFR_FN(padlow) + +template <size_t Nout, typename T> +KFR_INTRINSIC vec<T, Nout> extend(const vec<T, 1>& x) +{ + return vec<T, Nout>(x.front()); +} +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)> +KFR_INTRINSIC vec<T, Nout> extend(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<Nout>); +} +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)> +constexpr KFR_INTRINSIC const vec<T, Nout>& extend(const vec<T, N>& x) +{ + return x; +} +KFR_FN(extend) + +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, count> slice(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<count, start>); +} +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, csizeseq<count, start>); +} +KFR_FN(slice) + +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, N> replace(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, csizeseq<N> + (csizeseq<N> >= csize<start> && csizeseq<N> < csize<start + count>)*N); +} +KFR_FN(replace) + +template <size_t, typename T, size_t N> +KFR_INTRINSIC void split(const vec<T, N>&) +{ +} +template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args> +KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... 
args) +{ + out = x.shuffle(csizeseq<Nout, start>); + split<start + Nout>(x, std::forward<Args>(args)...); +} +template <typename T, size_t N> +KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, N / 2>& low, vec<T, N / 2>& high) +{ + low = x.shuffle(csizeseq<N / 2, 0>); + high = x.shuffle(csizeseq<N / 2, N / 2>); +} +template <typename T, size_t N> +KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, N / 4>& w0, vec<T, N / 4>& w1, vec<T, N / 4>& w2, + vec<T, N / 4>& w3) +{ + w0 = x.shuffle(csizeseq<N / 4, 0>); + w1 = x.shuffle(csizeseq<N / 4, N / 4>); + w2 = x.shuffle(csizeseq<N / 4, 2 * N / 4>); + w3 = x.shuffle(csizeseq<N / 4, 3 * N / 4>); +} +KFR_FN(split) + +template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total> +KFR_INTRINSIC vec<T, Nout> part(const vec<T, N>& x) +{ + static_assert(N % total == 0, "N % total == 0"); + return x.shuffle(csizeseq<Nout, number * Nout>); +} +KFR_FN(part) + +template <size_t start, size_t count, typename T, size_t N> +KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, csizeseq<count, start>); +} + +template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)> +KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) +{ + return x.shuffle(y.shuffle(csizeseq<N1>), csizeseq<N1 * 2>).shuffle(csizeseq<count, start>); +} + +template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)> +KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y) +{ + return x.shuffle(csizeseq<N2, -(N2 - N1)>) + .shuffle(y, csizeseq<N2 * 2>) + .shuffle(csizeseq<count, N2 - N1 + start>); +} + +KFR_FN(concat_and_slice) + +template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout > N)> +KFR_INTRINSIC vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T()) +{ + static_assert(Nout > N, "Nout > N"); + return concat(x, broadcast<Nout - 
N>(newvalue)); +} +template <size_t Nout, typename T, typename TS> +constexpr KFR_INTRINSIC const vec<T, Nout>& widen(const vec<T, Nout>& x, TS) +{ + return x; +} +KFR_FN(widen) + +template <size_t Nout, typename T, size_t N> +KFR_INTRINSIC vec<T, Nout> narrow(const vec<T, N>& x) +{ + static_assert(Nout <= N, "Nout <= N"); + return slice<0, Nout>(x); +} +KFR_FN(narrow) + +template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> +KFR_INTRINSIC vec<T, Nout> even(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq<Nout / group, 0, 2>)); +} +KFR_FN(even) + +template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)> +KFR_INTRINSIC vec<T, Nout> odd(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq<Nout / group, 1, 2>)); +} +KFR_FN(odd) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> dupeven(const vec<T, N>& x) +{ + static_assert(N % 2 == 0, "N must be even"); + return x.shuffle(csizeseq<N, 0, 1> & ~csize<1>); +} +KFR_FN(dupeven) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> dupodd(const vec<T, N>& x) +{ + static_assert(N % 2 == 0, "N must be even"); + return x.shuffle(csizeseq<N, 0, 1> | csize<1>); +} +KFR_FN(dupodd) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N * 2> duphalfs(const vec<T, N>& x) +{ + return x.shuffle(csizeseq<N * 2> % csize<N>); +} +KFR_FN(duphalfs) + +template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y, + elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] + + csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); +} +KFR_FN(shuffle) + +template <size_t group, size_t... 
Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y, + elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(y, scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + + csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); +} +KFR_FN(shufflegroups) + +template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(i[csizeseq_t<N>() % csize_t<count>()] + + csizeseq_t<N>() / csize_t<count>() * csize_t<count>()); +} +KFR_FN(permute) + +template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)> +KFR_INTRINSIC vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] + + csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>())); +} +KFR_FN(permutegroups) + +namespace internal +{ + +template <typename T, size_t Nout, typename Fn, size_t... 
Indices> +constexpr KFR_INTRINSIC vec<T, Nout> generate_vector(csizes_t<Indices...>) +{ + return make_vector(static_cast<T>(Fn()(Indices))...); +} +} // namespace internal + +template <typename T, size_t Nout, typename Fn> +constexpr KFR_INTRINSIC vec<T, Nout> generate_vector() +{ + return internal::generate_vector<T, Nout, Fn>(cvalseq_t<size_t, Nout>()); +} +KFR_FN(generate_vector) + +namespace internal +{ +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> evenmask() +{ + return broadcast<N>(maskbits<T>(true), maskbits<T>(false)); +} +template <typename T, size_t N> +KFR_INTRINSIC mask<T, N> oddmask() +{ + return broadcast<N>(maskbits<T>(false), maskbits<T>(true)); +} +} // namespace internal + +template <typename T, size_t N, size_t Nout = N * 2> +KFR_INTRINSIC vec<T, Nout> dup(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<Nout>() / csize_t<2>()); +} +KFR_FN(dup) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> duplow(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>()); +} +KFR_FN(duplow) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> duphigh(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>() + csize_t<N - N / 2>()); +} +KFR_FN(duphigh) + +template <size_t... Indices, typename T, size_t N> +KFR_INTRINSIC vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y, + elements_t<Indices...> i = elements_t<Indices...>()) +{ + return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] * csize_t<N>() + csizeseq_t<N>()); +} +KFR_FN(blend) + +template <size_t elements = 2, typename T, size_t N> +KFR_INTRINSIC vec<T, N> swap(const vec<T, N>& x) +{ + return x.shuffle(csizeseq_t<N>() ^ csize_t<elements - 1>()); +} +CMT_FN_TPL((size_t elements), (elements), swap) + +template <size_t shift, typename T, size_t N> +KFR_INTRINSIC vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi) +{ + return shift == 0 ? lo : (shift == N ? 
hi : hi.shuffle(lo, csizeseq_t<N, N - shift>())); +} + +template <size_t amount, typename T, size_t N> +KFR_INTRINSIC vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) +{ + static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); + return x.shuffle(csizeseq_t<N, N - amount>() % csize_t<N>()); +} +KFR_FN(rotateright) + +template <size_t amount, typename T, size_t N> +KFR_INTRINSIC vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>()) +{ + static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N"); + return x.shuffle(csizeseq_t<N, amount>() % csize_t<N>()); +} +KFR_FN(rotateleft) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> insertright(T x, const vec<T, N>& y) +{ + return concat_and_slice<1, N>(y, vec<T, 1>(x)); +} +KFR_FN(insertright) + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> insertleft(T x, const vec<T, N>& y) +{ + return concat_and_slice<0, N>(vec<T, 1>(x), y); +} +KFR_FN(insertleft) + +template <size_t side1, size_t group = 1, typename T, size_t N, size_t size = N / group, + size_t side2 = size / side1, KFR_ENABLE_IF(size > 3)> +KFR_INTRINSIC vec<T, N> transpose(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +template <size_t side, size_t group = 1, typename T, size_t N, KFR_ENABLE_IF(N / group <= 3)> +KFR_INTRINSIC vec<T, N> transpose(const vec<T, N>& x) +{ + return x; +} +template <typename T, size_t N> +KFR_INTRINSIC vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x) +{ + return vec<vec<T, N>, N>::from_flatten(transpose<2>(x.flatten())); +} +KFR_FN(transpose) + +template <size_t side2, size_t group = 1, typename T, size_t N, size_t size = N / group, + size_t side1 = size / side2, KFR_ENABLE_IF(size > 3)> +KFR_INTRINSIC vec<T, N> transposeinverse(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % 
csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)> +KFR_INTRINSIC vec<T, N> transposeinverse(const vec<T, N>& x) +{ + return x; +} +KFR_FN(transposeinverse) + +template <size_t side, typename T, size_t N> +KFR_INTRINSIC vec<T, N> ctranspose(const vec<T, N>& x) +{ + return transpose<side, 2>(x); +} +KFR_FN(ctranspose) + +template <size_t side, typename T, size_t N> +KFR_INTRINSIC vec<T, N> ctransposeinverse(const vec<T, N>& x) +{ + return transposeinverse<side, 2>(x); +} +KFR_FN(ctransposeinverse) + +template <size_t group = 1, typename T, size_t N, size_t Nout = N * 2, size_t size = Nout / group, + size_t side2 = 2, size_t side1 = size / side2> +KFR_INTRINSIC vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y) +{ + return x.shuffle(y, scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +KFR_FN(interleave) + +template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side2 = 2, + size_t side1 = size / side2> +KFR_INTRINSIC vec<T, N> interleavehalfs(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +KFR_FN(interleavehalfs) + +template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side1 = 2, + size_t side2 = size / side1> +KFR_INTRINSIC vec<T, N> splitpairs(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() + + csizeseq_t<size>() / csize_t<side2>())); +} +KFR_FN(splitpairs) + +template <size_t group = 1, typename T, size_t N, size_t size = N / group> +KFR_INTRINSIC vec<T, N> reverse(const vec<T, N>& x) +{ + return x.shuffle(scale<group>(csizeseq_t<size, size - 1, -1>())); +} +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC 
vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x) +{ + return swap<N1>(x.flatten()).v; +} +KFR_FN(reverse) + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y) +{ + static_assert(N2 <= N1, "N2 <= N1"); + return x.shuffle(extend<N1>(y), (csizeseq_t<N1>() < csize_t<N2>()) * csize_t<N1>() + csizeseq_t<N1>()); +} +KFR_FN(combine) + +namespace internal +{ +template <size_t start, size_t stride> +struct generate_index +{ + KFR_INTRINSIC constexpr size_t operator()(size_t index) const { return start + index * stride; } +}; +template <size_t start, size_t size, int on, int off> +struct generate_onoff +{ + KFR_INTRINSIC constexpr size_t operator()(size_t index) const + { + return index >= start && index < start + size ? on : off; + } +}; +} // namespace internal + +template <typename T, size_t N, size_t start = 0, size_t stride = 1> +constexpr KFR_INTRINSIC vec<T, N> enumerate() +{ + return generate_vector<T, N, internal::generate_index<start, stride>>(); +} +template <size_t start = 0, size_t stride = 1, typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> enumerate(vec_shape<T, N>) +{ + return generate_vector<T, N, internal::generate_index<start, stride>>(); +} +KFR_FN(enumerate) + +template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0> +constexpr KFR_INTRINSIC vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>()) +{ + return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); +} +template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> onoff(vec_shape<T, N>, cint_t<on> = cint_t<on>(), + cint_t<off> = cint_t<off>()) +{ + return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>(); +} +KFR_FN(onoff) + +} // namespace CMT_ARCH_NAME +} // namespace kfr +#define KFR_SHUFFLE_SPECIALIZATIONS 1 +#include "impl/specializations.i" diff 
--git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp @@ -0,0 +1,372 @@ +/** @addtogroup types + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../kfr.h" + +#include "impl/intrinsics.h" + +#include <climits> + +#include <cmath> +#include <limits> +#include <random> + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers") + +#ifdef KFR_TESTING +#include "../testo/testo.hpp" +#endif + +#include "../cometa.hpp" +#include "../cometa/numeric.hpp" + +namespace kfr +{ + +// Include all from CoMeta library +using namespace cometa; + +using cometa::fbase; +using cometa::fmax; + +// primary template (used for zero types) +template <typename... T> +struct common_type_impl +{ +}; + +template <typename... 
T> +using decay_common = decay<common_type_impl<T...>>; + +template <typename T1, typename T2, template <typename TT> class result_type, typename = void> +struct common_type_from_subtypes +{ +}; + +template <typename T1, typename T2, template <typename TT> class result_type> +struct common_type_from_subtypes<T1, T2, result_type, void_t<typename common_type_impl<T1, T2>::type>> +{ + using type = result_type<typename common_type_impl<T1, T2>::type>; +}; + +template <typename T> +struct common_type_impl<T> +{ + using type = decay<T>; +}; + +template <typename T1, typename T2> +using common_for_two = decltype(false ? std::declval<T1>() : std::declval<T2>()); + +template <typename T1, typename T2, typename = void> +struct common_type_2_default +{ +}; + +template <typename T1, typename T2> +struct common_type_2_default<T1, T2, void_t<common_for_two<T1, T2>>> +{ + using type = std::decay_t<common_for_two<T1, T2>>; +}; + +template <typename T1, typename T2, typename D1 = decay<T1>, typename D2 = decay<T2>> +struct common_type_2_impl : common_type_impl<D1, D2> +{ +}; + +template <typename D1, typename D2> +struct common_type_2_impl<D1, D2, D1, D2> : common_type_2_default<D1, D2> +{ +}; + +template <typename T1, typename T2> +struct common_type_impl<T1, T2> : common_type_2_impl<T1, T2> +{ +}; + +template <typename AlwaysVoid, typename T1, typename T2, typename... R> +struct common_type_multi_impl +{ +}; + +template <typename T1, typename T2, typename... R> +struct common_type_multi_impl<void_t<typename common_type_impl<T1, T2>::type>, T1, T2, R...> + : common_type_impl<typename common_type_impl<T1, T2>::type, R...> +{ +}; + +template <typename T1, typename T2, typename... R> +struct common_type_impl<T1, T2, R...> : common_type_multi_impl<void, T1, T2, R...> +{ +}; + +template <typename... 
T> +using common_type = typename common_type_impl<T...>::type; + +constexpr ctypes_t<i8, i16, i32, i64> signed_types{}; +constexpr ctypes_t<u8, u16, u32, u64> unsigned_types{}; +constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64> integer_types{}; +constexpr ctypes_t<f32 +#ifdef KFR_NATIVE_F64 + , + f64 +#endif + > + float_types{}; +constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64, f32 +#ifdef KFR_NATIVE_F64 + , + f64 +#endif + > + numeric_types{}; + +constexpr csizes_t<1, 2, 3, 4, 8, 16, 32, 64> test_vector_sizes{}; + +template <template <typename, size_t> class vec_tpl, typename T, + typename sizes = +#ifdef KFR_EXTENDED_TESTS + cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<64 / sizeof(T)>)> +#else + csizes_t<1> +#endif + > +struct vector_types_for_size_t_impl; + +template <template <typename, size_t> class vec_tpl, typename T, size_t... sizes> +struct vector_types_for_size_t_impl<vec_tpl, T, csizes_t<sizes...>> +{ + using type = ctypes_t<vec_tpl<T, sizes>...>; +}; + +template <template <typename, size_t> class vec_tpl, typename T> +using vector_types_for_size_t = typename vector_types_for_size_t_impl<vec_tpl, T>::type; + +template <template <typename, size_t> class vec_tpl> +using signed_vector_types_t = + concat_lists<vector_types_for_size_t<vec_tpl, i8>, vector_types_for_size_t<vec_tpl, i16>, + vector_types_for_size_t<vec_tpl, i32>, vector_types_for_size_t<vec_tpl, i64>>; + +template <template <typename, size_t> class vec_tpl> +constexpr signed_vector_types_t<vec_tpl> signed_vector_types{}; + +template <template <typename, size_t> class vec_tpl> +using unsigned_vector_types_t = + concat_lists<vector_types_for_size_t<vec_tpl, u8>, vector_types_for_size_t<vec_tpl, u16>, + vector_types_for_size_t<vec_tpl, u32>, vector_types_for_size_t<vec_tpl, u64>>; + +template <template <typename, size_t> class vec_tpl> +constexpr unsigned_vector_types_t<vec_tpl> unsigned_vector_types{}; + +template <template <typename, size_t> class 
vec_tpl> +using integer_vector_types_t = concat_lists<signed_vector_types_t<vec_tpl>, unsigned_vector_types_t<vec_tpl>>; + +template <template <typename, size_t> class vec_tpl> +constexpr integer_vector_types_t<vec_tpl> integer_vector_types{}; + +template <template <typename, size_t> class vec_tpl> +using float_vector_types_t = concat_lists<vector_types_for_size_t<vec_tpl, f32> +#ifdef KFR_NATIVE_F64 + , + vector_types_for_size_t<vec_tpl, f64> +#endif + >; + +template <template <typename, size_t> class vec_tpl> +constexpr float_vector_types_t<vec_tpl> float_vector_types{}; + +template <template <typename, size_t> class vec_tpl> +constexpr concat_lists<integer_vector_types_t<vec_tpl>, float_vector_types_t<vec_tpl>> numeric_vector_types{}; + +struct u24 +{ + u8 raw[3]; +}; + +struct i24 +{ + u8 raw[3]; + + constexpr i24(i32 x) CMT_NOEXCEPT : raw{} + { + raw[0] = x & 0xFF; + raw[1] = (x >> 8) & 0xFF; + raw[2] = (x >> 16) & 0xFF; + } + + constexpr i32 as_int() const CMT_NOEXCEPT + { + return static_cast<i32>(raw[0]) | static_cast<i32>(raw[1] << 8) | + (static_cast<i32>(raw[2] << 24) >> 8); + } + + operator int() const CMT_NOEXCEPT { return as_int(); } +}; + +struct f16 +{ + u16 raw; +}; + +template <size_t bits> +struct bitmask +{ + using type = conditional<(bits > 32), uint64_t, + conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>; + + bitmask(type val) : value(val) {} + + type value; +}; + +template <typename T> +struct maskbit +{ + bool value; +}; + +namespace fn_generic +{ +///@copybrief cometa::pass_through +using pass_through = cometa::fn_pass_through; + +///@copybrief cometa::noop +using noop = cometa::fn_noop; + +///@copybrief cometa::get_first +using get_first = cometa::fn_get_first; + +///@copybrief cometa::get_second +using get_second = cometa::fn_get_second; + +///@copybrief cometa::get_third +using get_third = cometa::fn_get_third; + +///@copybrief cometa::returns +template <typename T> +using returns = cometa::fn_returns<T>; 
+} // namespace fn_generic + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes") + +template <typename T, bool A> +struct struct_with_alignment +{ + using pointer = struct_with_alignment*; + using const_pointer = const struct_with_alignment*; + T value; + KFR_MEM_INTRINSIC void operator=(T value) { this->value = value; } +}; + +#ifdef CMT_COMPILER_MSVC +#define KFR_UNALIGNED_POINTER __unaligned +#else +#define KFR_UNALIGNED_POINTER +#endif + +template <typename T> +struct struct_with_alignment<T, false> +{ + using pointer = KFR_UNALIGNED_POINTER struct_with_alignment*; + using const_pointer = KFR_UNALIGNED_POINTER const struct_with_alignment*; + T value; + KFR_MEM_INTRINSIC void operator=(T value) { this->value = value; } +} +#ifdef CMT_GNU_ATTRIBUTES +__attribute__((__packed__, __may_alias__)) // +#endif +; + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +/// @brief Fills a value with zeros +template <typename T1> +KFR_INTRINSIC void zeroize(T1& value) +{ + builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1)); +} + +/// @brief Used to determine the initial value for reduce functions +template <typename T> +struct initialvalue +{ +}; + +template <typename T> +struct is_simd_type + : std::integral_constant< + bool, std::is_same<T, float>::value || std::is_same<T, double>::value || + std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value || + std::is_same<T, short>::value || std::is_same<T, unsigned short>::value || + std::is_same<T, int>::value || std::is_same<T, unsigned int>::value || + std::is_same<T, long>::value || std::is_same<T, unsigned long>::value || + std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value> +{ +}; + +template <typename T, size_t N> +struct vec_shape +{ + using value_type = T; + constexpr static size_t size() CMT_NOEXCEPT { return N; } + constexpr vec_shape() CMT_NOEXCEPT = default; + + using scalar_type = subtype<T>; + constexpr static 
size_t scalar_size() CMT_NOEXCEPT { return N * compound_type_traits<T>::width; } +}; + +constexpr size_t index_undefined = static_cast<size_t>(-1); + +struct czeros_t +{ +}; +struct cones_t +{ +}; +constexpr czeros_t czeros{}; +constexpr cones_t cones{}; + +using caligned_t = cbool_t<true>; +using cunaligned_t = cbool_t<false>; + +constexpr caligned_t caligned{}; +constexpr cunaligned_t cunaligned{}; + +#ifdef CMT_INTRINSICS_IS_CONSTEXPR +#define KFR_I_CE constexpr +#else +#define KFR_I_CE +#endif + +#define avoid_odr_use(x) static_cast<decltype(x)>(x) + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp @@ -0,0 +1,1283 @@ +/** @addtogroup types + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../version.hpp" +#include "constants.hpp" +#include "impl/backend.hpp" + +/** + * @brief Internal macro for functions + */ +#define KFR_FN(FN) \ + namespace fn \ + { \ + struct FN \ + { \ + template <typename... 
Args> \ + CMT_INLINE_MEMBER decltype(::kfr::FN(std::declval<Args>()...)) operator()(Args&&... args) const \ + { \ + return ::kfr::FN(std::forward<Args>(args)...); \ + } \ + }; \ + } + +/** + * @brief Internal macro for functions + */ +#define KFR_I_FN(FN) \ + namespace fn \ + { \ + struct FN \ + { \ + template <typename... Args> \ + CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \ + Args&&... args) const \ + { \ + return ::kfr::intrinsics::FN(std::forward<Args>(args)...); \ + } \ + }; \ + } + +#define KFR_I_FN_FULL(FN, FULLFN) \ + namespace fn \ + { \ + struct FN \ + { \ + template <typename... Args> \ + CMT_INLINE_MEMBER decltype(FULLFN(std::declval<Args>()...)) operator()(Args&&... args) const \ + { \ + return FULLFN(std::forward<Args>(args)...); \ + } \ + }; \ + } + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked") + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4814)) + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N> +struct alignas(next_poweroftwo(sizeof(T)) * next_poweroftwo(N)) portable_vec +{ + static constexpr vec_shape<T, N> shape() CMT_NOEXCEPT { return {}; } + + static_assert(N > 0 && N <= 1024, "Invalid vector size"); + + static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type"); + + // type and size + using value_type = T; + + constexpr static size_t size() CMT_NOEXCEPT { return N; } + + T elem[N]; +}; + +template <typename T, size_t N> +struct vec; + +template <typename T, size_t N> +struct mask; + +template <typename T, size_t N> +struct vec_halves +{ + vec<T, prev_poweroftwo(N - 1)> low; + vec<T, N - prev_poweroftwo(N - 1)> high; 
+}; + +template <typename T> +struct vec_halves<T, 1> +{ + T val; +}; + +namespace internal +{ + +// scalar to scalar +template <typename To, typename From> +struct conversion +{ + static_assert(std::is_convertible<From, To>::value, ""); + + static To cast(const From& value) { return value; } +}; + +template <typename T> +struct compoundcast +{ + static vec<T, 1> to_flat(const T& x) { return vec<T, 1>(x); } + static T from_flat(const vec<T, 1>& x) { return x.front(); } +}; +template <typename T, size_t N> +struct compoundcast<vec<T, N>> +{ + static const vec<T, N>& to_flat(const vec<T, N>& x) { return x; } + static const vec<T, N>& from_flat(const vec<T, N>& x) { return x; } +}; +template <typename T, size_t N1, size_t N2> +struct compoundcast<vec<vec<T, N1>, N2>> +{ + static vec<T, N1 * N2> to_flat(const vec<vec<T, N1>, N2>& x) { return x; } + static vec<vec<T, N1>, N2> from_flat(const vec<T, N1 * N2>& x) { return x; } +}; +} // namespace internal + +template <typename T, size_t N> +struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<T>::deep_subtype, + N * compound_type_traits<T>::deep_width>), + const_min(size_t(platform<>::native_vector_alignment), + next_poweroftwo(sizeof(typename compound_type_traits<T>::deep_subtype) * + N * compound_type_traits<T>::deep_width)))) vec +{ + static constexpr vec_shape<T, N> shape() CMT_NOEXCEPT { return {}; } + + // type and size + using value_type = T; + + constexpr static size_t size() CMT_NOEXCEPT { return N; } + + using ST = typename compound_type_traits<T>::deep_subtype; + using scalar_type = ST; + + enum : size_t + { + SW = compound_type_traits<T>::deep_width, + SN = N * SW + }; + + constexpr static size_t scalar_size() CMT_NOEXCEPT { return SN; } + + static_assert(is_simd_type<scalar_type>::value, "Invalid vector type"); + + static_assert(scalar_size() > 0 && scalar_size() <= 1024, "Invalid vector size"); + + using mask_t = mask<T, N>; + + using simd_type = intrinsics::simd<ST, SN>; + using 
uvalue_type = utype<T>; + using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>; + + using uscalar_type = utype<ST>; + using iuscalar_type = conditional<is_i_class<ST>::value, ST, uscalar_type>; + + using usimd_type = intrinsics::simd<uscalar_type, SN>; + using iusimd_type = intrinsics::simd<iuscalar_type, SN>; + + // constructors and assignment + // from SIMD + KFR_MEM_INTRINSIC vec(const simd_type& simd) CMT_NOEXCEPT : v(simd) {} + // default + KFR_MEM_INTRINSIC constexpr vec() CMT_NOEXCEPT = default; + // copy + KFR_MEM_INTRINSIC constexpr vec(const vec& value) CMT_NOEXCEPT = default; + // move + KFR_MEM_INTRINSIC constexpr vec(vec&&) CMT_NOEXCEPT = default; + // assignment + KFR_MEM_INTRINSIC constexpr vec& operator=(const vec&) CMT_NOEXCEPT = default; + + // from scalar + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT + : v(intrinsics::simd_broadcast(intrinsics::simd_t<ST, SN>{}, static_cast<ST>(s))) + { + } + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT + : v(intrinsics::simd_shuffle(intrinsics::simd_t<ST, SW>{}, + internal::compoundcast<T>::to_flat(static_cast<T>(s)).v, + csizeseq<SN> % csize<SW>, overload_auto)) + { + } + + // from list + template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT + : v(intrinsics::simd_make(ctype<T>, s0, s1, static_cast<value_type>(rest)...)) + { + } + template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... 
rest) CMT_NOEXCEPT + : v(intrinsics::simd_concat<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>( + internal::compoundcast<T>::to_flat(s0).v, internal::compoundcast<T>::to_flat(s1).v, + internal::compoundcast<T>::to_flat(static_cast<T>(rest)).v...)) + { + } + + // from vector of another type + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT + : v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v)) + { + } + template <typename U, + KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT + : v(internal::conversion<vec<T, N>, vec<U, N>>::cast(x).v) + { + } + + // from list of vectors + template <size_t... Ns, typename = enable_if<csum<size_t, Ns...>() == N>> + KFR_MEM_INTRINSIC vec(const vec<T, Ns>&... vs) CMT_NOEXCEPT + : v(intrinsics::simd_concat<ST, (SW * Ns)...>(vs.v...)) + { + } + + KFR_MEM_INTRINSIC vec(const portable_vec<T, N>& p) CMT_NOEXCEPT : vec(bitcast_anything<vec>(p)) {} + + KFR_MEM_INTRINSIC operator portable_vec<T, N>() const CMT_NOEXCEPT + { + return bitcast_anything<portable_vec<T, N>>(*this); + } + + KFR_MEM_INTRINSIC vec(czeros_t) CMT_NOEXCEPT : v(intrinsics::simd_zeros<ST, SN>()) {} + + KFR_MEM_INTRINSIC vec(cones_t) CMT_NOEXCEPT : v(intrinsics::simd_allones<ST, SN>()) {} + + template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)> + KFR_MEM_INTRINSIC static vec frombits(const vec<U, M>& v) CMT_NOEXCEPT + { + return intrinsics::simd_bitcast( + intrinsics::simd_cvt_t<ST, typename vec<U, M>::scalar_type, vec<U, M>::scalar_size()>{}, v.v); + } + + // shuffle + template <size_t... 
indices> + KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...> i) const CMT_NOEXCEPT + { + return vec<value_type, sizeof...(indices)>( + intrinsics::simd_shuffle(intrinsics::simd_t<ST, SN>{}, v, scale<SW>(i), overload_auto)); + } + + template <size_t... indices> + KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(const vec& y, + csizes_t<indices...> i) const CMT_NOEXCEPT + { + return vec<value_type, sizeof...(indices)>( + intrinsics::simd_shuffle(intrinsics::simd2_t<ST, SN, SN>{}, v, y.v, scale<SW>(i), overload_auto)); + } + + // element access + struct element; + + KFR_MEM_INTRINSIC constexpr value_type operator[](size_t index) const& CMT_NOEXCEPT { return get(index); } + + KFR_MEM_INTRINSIC constexpr value_type operator[](size_t index) && CMT_NOEXCEPT { return get(index); } + + KFR_MEM_INTRINSIC constexpr element operator[](size_t index) & CMT_NOEXCEPT { return { *this, index }; } + + KFR_MEM_INTRINSIC value_type front() const CMT_NOEXCEPT { return get(csize<0>); } + + KFR_MEM_INTRINSIC value_type back() const CMT_NOEXCEPT { return get(csize<N - 1>); } + + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT + { + return intrinsics::simd_get_element<T, N>(v, index); + } + template <int dummy = 0, typename = void, + KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT + { + return this->s[index]; + } + + template <size_t index, KFR_ENABLE_IF(index < 1024 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT + { + return intrinsics::simd_get_element<T, N>(v, csize<index>); + } + template <size_t index, typename = void, + KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const 
CMT_NOEXCEPT + { + return internal::compoundcast<T>::from_flat(intrinsics::simd_shuffle( + intrinsics::simd_t<ST, SN>{}, v, csizeseq<SW, SW * index>, overload_auto)); + } + + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT + { + v = intrinsics::simd_set_element<T, N>(v, index, s); + } + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT + { + this->s[index] = s; + } + + template <size_t index, KFR_ENABLE_IF(index < 1024 && compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT + { + v = intrinsics::simd_set_element<T, N>(v, csize<index>, s); + } + template <size_t index, typename = void, + KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)> + KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT + { + this->s[index] = s; + } + + struct element + { + constexpr operator value_type() const CMT_NOEXCEPT { return v.get(index); } + + KFR_MEM_INTRINSIC element& operator=(const value_type& s) CMT_NOEXCEPT + { + v.set(index, s); + return *this; + } + + KFR_MEM_INTRINSIC element& operator=(const element& s) CMT_NOEXCEPT + { + v.set(index, static_cast<value_type>(s)); + return *this; + } + + template <typename U, size_t M> + KFR_MEM_INTRINSIC element& operator=(const typename vec<U, M>::element& s) CMT_NOEXCEPT + { + v.set(index, static_cast<value_type>(static_cast<U>(s))); + return *this; + } + + vec& v; + size_t index; + }; + + // read/write + template <bool aligned = false> + KFR_MEM_INTRINSIC explicit constexpr vec(const value_type* src, + cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT + : v(intrinsics::simd_read<SN, aligned>(ptr_cast<ST>(src))) + { + } + + template <bool aligned = false> + 
KFR_MEM_INTRINSIC const vec& write(value_type* dest, + cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT + { + intrinsics::simd_write<aligned, SN>(ptr_cast<ST>(dest), v); + return *this; + } + + KFR_MEM_INTRINSIC vec<ST, SN> flatten() const CMT_NOEXCEPT { return v; } + KFR_MEM_INTRINSIC static vec from_flatten(const vec<ST, SN>& x) { return vec(x.v); } + + KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); } + + constexpr static size_t simd_element_size = const_min(vector_width<T>, N); + constexpr static size_t simd_element_count = N / simd_element_size; + using simd_element_type = simd<ST, simd_element_size>; + +public: + union { + simd_type v; + vec_halves<T, N> h; + simd_element_type w[simd_element_count]; + T s[N]; + }; +}; + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevector(const vec<T, N>& x, + csizes_t<indices...> i) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd_t<T, N>{}, x.v, i, overload_auto); +} + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevectors(const vec<T, N>& x, const vec<T, N>& y, + csizes_t<indices...> i) CMT_NOEXCEPT +{ + return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, i, overload_auto); +} + +namespace internal +{ + +#if 0 +constexpr inline size_t scale_get_index(size_t counter, size_t groupsize, size_t index) CMT_NOEXCEPT +{ + return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index); +} + +#ifdef CMT_COMPILER_MSVC +template <size_t counter, size_t groupsize, size_t... indices> +constexpr inline size_t scale_get_index(csizes_t<indices...>) CMT_NOEXCEPT +{ + return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); +} + +template <size_t... indices, size_t... 
counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> +constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) CMT_NOEXCEPT + -> csizes_t<scale_get_index<counter, groupsize>(ind)...> +{ + return {}; +} +#else + +template <size_t counter, size_t groupsize, size_t... indices> +constexpr inline size_t scale_get_index() CMT_NOEXCEPT +{ + return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); +} + +template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> +constexpr inline auto scale_impl(csizes_t<indices...>, csizes_t<counter...>) CMT_NOEXCEPT + -> csizes_t<scale_get_index<counter, groupsize, indices...>()...> +{ + return {}; +} + +#endif +#endif + +} // namespace internal + +template <size_t groupsize, size_t... indices> +constexpr inline auto scale() CMT_NOEXCEPT +{ + return cconcat(csizeseq<groupsize, groupsize * indices>...); + // return internal::scale_impl(csizes_t<indices...>(), csizeseq<sizeof...(indices) * groupsize>); +} + +namespace internal +{ +template <typename T> +struct is_vec_impl : std::false_type +{ +}; + +template <typename T, size_t N> +struct is_vec_impl<vec<T, N>> : std::true_type +{ +}; +} // namespace internal + +template <typename T> +using is_vec = internal::is_vec_impl<T>; + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast") + +template <size_t N, typename T> +constexpr KFR_INTRINSIC vec<T, N> broadcast(T x) +{ + return x; +} + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +namespace internal +{ + +template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To>, + size_t Nout = (N * compound_type_traits<To>::deep_width)> +constexpr KFR_INTRINSIC vec<To, N> builtin_convertvector(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, N>(value); +} + +// vector to vector +template <typename To, typename From, size_t N, size_t N2> +struct 
conversion<vec<To, N>, vec<From, N2>> +{ + static_assert(N == N2, ""); + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + + static vec<To, N> cast(const vec<From, N>& value) { return vec<To, N>(value); } +}; + +// scalar to vector +template <typename To, typename From, size_t N> +struct conversion<vec<To, N>, From> +{ + static_assert(std::is_convertible<From, To>::value, ""); + + static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); } +}; +} // namespace internal + +template <typename T> +constexpr size_t size_of() CMT_NOEXCEPT +{ + return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width; +} + +template <typename From, size_t N, typename Tsub = deep_subtype<From>, + size_t Nout = N* size_of<From>() / size_of<Tsub>()> +constexpr KFR_INTRINSIC vec<Tsub, Nout> flatten(const vec<From, N>& x) CMT_NOEXCEPT +{ + return x.flatten(); +} + +template <typename To, typename From, + typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> +constexpr KFR_INTRINSIC Tout cast(const From& value) CMT_NOEXCEPT +{ + return static_cast<Tout>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N> cast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return vec<Tout, N>(value); +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<vec<Tout, N1>, N2> cast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return vec<vec<Tout, N1>, N2>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<Tin, N>& cast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return value; +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<vec<Tin, N1>, N2>& 
cast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return value; +} + +// + +template <typename To, typename From, + typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> +constexpr KFR_INTRINSIC Tout innercast(const From& value) CMT_NOEXCEPT +{ + return static_cast<Tout>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N> innercast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return vec<Tout, N>(value); +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<vec<Tout, N1>, N2> innercast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return vec<vec<Tout, N1>, N2>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<Tin, N>& innercast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return value; +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<vec<Tin, N1>, N2>& innercast(const vec<vec<Tin, N1>, N2>& value) + CMT_NOEXCEPT +{ + return value; +} + +// + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N> elemcast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return vec<Tout, N>(value); +} + +template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC const vec<Tin, N>& elemcast(const vec<Tin, N>& value) CMT_NOEXCEPT +{ + return value; +} + +template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)> +constexpr KFR_INTRINSIC vec<Tout, N2> elemcast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT +{ + return vec<Tout, N2>(value); +} + +template <typename To, typename From> +CMT_GNU_CONSTEXPR KFR_INTRINSIC To 
bitcast(const From& value) CMT_NOEXCEPT +{ + static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types"); + union { + From from; + To to; + } u{ value }; + return u.to; +} + +template <typename To, typename From, size_t N, size_t Nout = (N * size_of<From>() / size_of<To>())> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<To, Nout> bitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To ubitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To ibitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To fbitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, typename To = uitype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> +constexpr KFR_INTRINSIC To uibitcast(const From& value) CMT_NOEXCEPT +{ + return bitcast<To>(value); +} + +template <typename From, size_t N, typename To = utype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> ubitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +template <typename From, size_t N, typename To = itype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> ibitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +template <typename From, size_t N, typename To = ftype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> fbitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + 
+template <typename From, size_t N, typename To = uitype<From>, + size_t Nout = size_of<From>() * N / size_of<To>()> +constexpr KFR_INTRINSIC vec<To, Nout> uibitcast(const vec<From, N>& value) CMT_NOEXCEPT +{ + return vec<To, Nout>::frombits(value); +} + +constexpr KFR_INTRINSIC size_t vector_alignment(size_t size) { return next_poweroftwo(size); } + +template <typename T, size_t N> +struct pkd_vec +{ + constexpr pkd_vec() CMT_NOEXCEPT {} + + pkd_vec(const vec<T, N>& value) CMT_NOEXCEPT { value.write(v); } + + template <typename... Ts> + constexpr pkd_vec(Ts... init) CMT_NOEXCEPT : v{ static_cast<T>(init)... } + { + static_assert(N <= sizeof...(Ts), "Too few initializers for pkd_vec"); + } + +private: + T v[N]; + friend struct vec<T, N>; +} +#ifdef CMT_GNU_ATTRIBUTES +__attribute__((packed)) +#endif +; + +namespace internal +{ + +template <size_t, typename T> +constexpr KFR_INTRINSIC T make_vector_get_n() +{ + return T(); +} + +template <size_t index, typename T, typename... Args> +constexpr KFR_INTRINSIC T make_vector_get_n(const T& arg, const Args&... args) +{ + return index == 0 ? arg : make_vector_get_n<index - 1, T>(args...); +} + +template <typename T, typename... Args, size_t... indices, size_t N = sizeof...(Args)> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> make_vector_impl(csizes_t<indices...>, const Args&... args) +{ + static_assert(sizeof...(indices) == sizeof...(Args), ""); + const T list[] = { static_cast<T>(args)... }; + return vec<T, N>(list[indices]...); +} +} // namespace internal + +/// Create vector from scalar values +/// @code +/// CHECK( make_vector( 1, 2, 3, 4 ) == i32x4{1, 2, 3, 4} ); +/// @endcode +template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), + typename SubType = fix_type<conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>> +constexpr KFR_INTRINSIC vec<SubType, N> make_vector(const Arg& x, const Args&... rest) +{ + // static_assert(! 
is_same<SubType, unsigned long long>::value, "!!!--1"); + // static_assert(! is_same<fix_type<SubType>, unsigned long long>::value, "!!!--2"); + return internal::make_vector_impl<SubType>(cvalseq_t<size_t, N>(), static_cast<SubType>(x), + static_cast<SubType>(rest)...); +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> make_vector(const vec<T, N>& x) +{ + return x; +} + +template <typename T, T... Values, size_t N = sizeof...(Values)> +constexpr KFR_INTRINSIC vec<T, N> make_vector(cvals_t<T, Values...>) +{ + return make_vector<T>(Values...); +} + +template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1), + typename SubType = fix_type<conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>, + KFR_ENABLE_IF(is_number<subtype<SubType>>::value)> +constexpr KFR_INTRINSIC vec<SubType, N> pack(const Arg& x, const Args&... rest) +{ + return internal::make_vector_impl<SubType>(csizeseq<N>, static_cast<SubType>(x), + static_cast<SubType>(rest)...); +} + +using f32x1 = vec<f32, 1>; +using f32x2 = vec<f32, 2>; +using f32x3 = vec<f32, 3>; +using f32x4 = vec<f32, 4>; +using f32x8 = vec<f32, 8>; +using f32x16 = vec<f32, 16>; +using f32x32 = vec<f32, 32>; +using f32x64 = vec<f32, 64>; +using f64x1 = vec<f64, 1>; +using f64x2 = vec<f64, 2>; +using f64x3 = vec<f64, 3>; +using f64x4 = vec<f64, 4>; +using f64x8 = vec<f64, 8>; +using f64x16 = vec<f64, 16>; +using f64x32 = vec<f64, 32>; +using f64x64 = vec<f64, 64>; +using i8x1 = vec<i8, 1>; +using i8x2 = vec<i8, 2>; +using i8x3 = vec<i8, 3>; +using i8x4 = vec<i8, 4>; +using i8x8 = vec<i8, 8>; +using i8x16 = vec<i8, 16>; +using i8x32 = vec<i8, 32>; +using i8x64 = vec<i8, 64>; +using i16x1 = vec<i16, 1>; +using i16x2 = vec<i16, 2>; +using i16x3 = vec<i16, 3>; +using i16x4 = vec<i16, 4>; +using i16x8 = vec<i16, 8>; +using i16x16 = vec<i16, 16>; +using i16x32 = vec<i16, 32>; +using i16x64 = vec<i16, 64>; +using i32x1 = vec<i32, 1>; +using i32x2 = vec<i32, 2>; +using 
i32x3 = vec<i32, 3>; +using i32x4 = vec<i32, 4>; +using i32x8 = vec<i32, 8>; +using i32x16 = vec<i32, 16>; +using i32x32 = vec<i32, 32>; +using i32x64 = vec<i32, 64>; +using i64x1 = vec<i64, 1>; +using i64x2 = vec<i64, 2>; +using i64x3 = vec<i64, 3>; +using i64x4 = vec<i64, 4>; +using i64x8 = vec<i64, 8>; +using i64x16 = vec<i64, 16>; +using i64x32 = vec<i64, 32>; +using i64x64 = vec<i64, 64>; +using u8x1 = vec<u8, 1>; +using u8x2 = vec<u8, 2>; +using u8x3 = vec<u8, 3>; +using u8x4 = vec<u8, 4>; +using u8x8 = vec<u8, 8>; +using u8x16 = vec<u8, 16>; +using u8x32 = vec<u8, 32>; +using u8x64 = vec<u8, 64>; +using u16x1 = vec<u16, 1>; +using u16x2 = vec<u16, 2>; +using u16x3 = vec<u16, 3>; +using u16x4 = vec<u16, 4>; +using u16x8 = vec<u16, 8>; +using u16x16 = vec<u16, 16>; +using u16x32 = vec<u16, 32>; +using u16x64 = vec<u16, 64>; +using u32x1 = vec<u32, 1>; +using u32x2 = vec<u32, 2>; +using u32x3 = vec<u32, 3>; +using u32x4 = vec<u32, 4>; +using u32x8 = vec<u32, 8>; +using u32x16 = vec<u32, 16>; +using u32x32 = vec<u32, 32>; +using u32x64 = vec<u32, 64>; +using u64x1 = vec<u64, 1>; +using u64x2 = vec<u64, 2>; +using u64x3 = vec<u64, 3>; +using u64x4 = vec<u64, 4>; +using u64x8 = vec<u64, 8>; +using u64x16 = vec<u64, 16>; +using u64x32 = vec<u64, 32>; +using u64x64 = vec<u64, 64>; + +namespace glsl_names +{ +using vec2 = f32x2; +using vec3 = f32x3; +using vec4 = f32x4; +using dvec2 = f64x2; +using dvec3 = f64x3; +using dvec4 = f64x4; +using ivec2 = i32x2; +using ivec3 = i32x3; +using ivec4 = i32x4; +using uvec2 = u32x2; +using uvec3 = u32x3; +using uvec4 = u32x4; +} // namespace glsl_names +namespace opencl_names +{ +using char2 = i8x2; +using char3 = i8x3; +using char4 = i8x4; +using char8 = i8x8; +using char16 = i8x16; +using uchar2 = u8x2; +using uchar3 = u8x3; +using uchar4 = u8x4; +using uchar8 = u8x8; +using uchar16 = u8x16; + +using short2 = i16x2; +using short3 = i16x3; +using short4 = i16x4; +using short8 = i16x8; +using short16 = i16x16; +using ushort2 = 
u16x2; +using ushort3 = u16x3; +using ushort4 = u16x4; +using ushort8 = u16x8; +using ushort16 = u16x16; + +using int2 = i32x2; +using int3 = i32x3; +using int4 = i32x4; +using int8 = i32x8; +using int16 = i32x16; +using uint2 = u32x2; +using uint3 = u32x3; +using uint4 = u32x4; +using uint8 = u32x8; +using uint16 = u32x16; + +using long2 = i64x2; +using long3 = i64x3; +using long4 = i64x4; +using long8 = i64x8; +using long16 = i64x16; +using ulong2 = u64x2; +using ulong3 = u64x3; +using ulong4 = u64x4; +using ulong8 = u64x8; +using ulong16 = u64x16; + +using float2 = f32x2; +using float3 = f32x3; +using float4 = f32x4; +using float8 = f32x8; +using float16 = f32x16; + +using double2 = f64x2; +using double3 = f64x3; +using double4 = f64x4; +using double8 = f64x8; +using double16 = f64x16; +} // namespace opencl_names + +namespace internal +{ + +template <size_t Index, typename T, size_t N, typename Fn, typename... Args, + typename Tout = result_of<Fn(subtype<decay<Args>>...)>> +constexpr KFR_INTRINSIC Tout applyfn_helper(Fn&& fn, Args&&... args) +{ + return fn(args[Index]...); +} + +template <typename T, size_t N, typename Fn, typename... Args, + typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices> +constexpr KFR_INTRINSIC vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args) +{ + return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...); +} + +template <typename T, size_t N, typename Fn, size_t... Indices> +constexpr KFR_INTRINSIC vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) +{ + return make_vector(((void)Indices, void(), fn())...); +} +} // namespace internal + +template <typename T, size_t N, typename Fn, typename... Args, + typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> +constexpr KFR_INTRINSIC vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... 
args) +{ + return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>, arg, std::forward<Args>(args)...); +} + +template <typename T, typename Fn, typename... Args, typename Tout = result_of<Fn(T, decay<Args>...)>, + KFR_ENABLE_IF(is_same<T, subtype<T>>::value)> +constexpr KFR_INTRINSIC Tout apply(Fn&& fn, const T& arg, Args&&... args) +{ + return fn(arg, args...); +} + +template <size_t N, typename Fn, typename T = result_of<Fn()>> +constexpr KFR_INTRINSIC vec<T, N> apply(Fn&& fn) +{ + return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector() +{ + return vec<T, N>(czeros); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector(vec_shape<T, N>) +{ + return vec<T, N>(czeros); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector(vec<T, N>) +{ + return vec<T, N>(czeros); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector() +{ + return vec<T, N>(cones); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector(vec_shape<T, N>) +{ + return vec<T, N>(cones); +} + +template <typename T, size_t N> +CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector(vec<T, N>) +{ + return vec<T, N>(cones); +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> undefinedvector() +{ + return vec<T, N>{}; +} + +template <typename T, size_t N> +constexpr KFR_INTRINSIC vec<T, N> undefinedvector(vec_shape<T, N>) +{ + return undefinedvector<T, N>(); +} + +template <size_t N> +struct vec_template +{ + template <typename T> + using type = vec<T, N>; +}; + +#ifdef KFR_TESTING + +inline const std::vector<special_value>& special_values() +{ + static const std::vector<special_value> values{ special_constant::infinity, + special_constant::neg_infinity, + special_constant::min, + special_constant::lowest, + 
special_constant::max, + 3.1415926535897932384626433832795, + 4.499999, + 4.500001, + -4.499999, + -4.500001, + 0.1111111111111111111111111111111, + -0.4444444444444444444444444444444, + -1, + 0, + +1 }; + return values; +} + +namespace test_catogories +{ +constexpr cint_t<1> scalars{}; +constexpr cint_t<2> vectors{}; +constexpr cint_t<3> all{}; + +constexpr inline auto types(cint_t<0>) { return ctypes_t<>{}; } +constexpr inline auto types(cint_t<1>) { return cconcat(numeric_types); } +constexpr inline auto types(cint_t<2>) { return cconcat(numeric_vector_types<vec>); } +constexpr inline auto types(cint_t<3>) { return cconcat(numeric_types, numeric_vector_types<vec>); } + +} // namespace test_catogories + +template <typename T, size_t N, size_t... indices> +vec<T, N> test_enumerate(vec_shape<T, N>, csizes_t<indices...>, double start = 0, double step = 1) +{ + return make_vector<T>(static_cast<T>(start + step * indices)...); +} + +template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_return_constant<bool, true>> +void test_function1(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{}) +{ + testo::matrix( + named("type") = test_catogories::types(cat), named("value") = special_values(), + [&](auto type, special_value value) { + using T = type_of<decltype(type)>; + if (isapplicable(ctype<T>, value)) + { + const T x(value); + CHECK(std::is_same<decltype(fn(x)), typename compound_type_traits<T>::template rebind< + decltype(reffn(std::declval<subtype<T>>()))>>::value); + CHECK(fn(x) == apply(reffn, x)); + } + }); + + testo::matrix(named("type") = test_catogories::types(cint<Cat & ~1>), [&](auto type) { + using T = type_of<decltype(type)>; + const T x = test_enumerate(T::shape(), csizeseq<T::size()>, 0); + CHECK(fn(x) == apply(reffn, x)); + }); +} + +template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_return_constant<bool, true>> +void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, 
IsApplicable&& isapplicable = IsApplicable{}) +{ + testo::matrix( + named("type") = test_catogories::types(cat), + named("value1") = special_values(), // + named("value2") = special_values(), [&](auto type, special_value value1, special_value value2) { + using T = type_of<decltype(type)>; + const T x1(value1); + const T x2(value2); + if (isapplicable(ctype<T>, value1, value2)) + { + CHECK(std::is_same<decltype(fn(x1, x2)), + typename compound_type_traits<T>::template rebind<decltype(reffn( + std::declval<subtype<T>>(), std::declval<subtype<T>>()))>>::value); + CHECK(fn(x1, x2) == apply(reffn, x1, x2)); + } + }); + + testo::matrix(named("type") = test_catogories::types(cint<Cat & ~1>), [&](auto type) { + using T = type_of<decltype(type)>; + const T x1 = test_enumerate(T::shape(), csizeseq<T::size()>, 0, 1); + const T x2 = test_enumerate(T::shape(), csizeseq<T::size()>, 100, -1); + CHECK(fn(x1, x2) == apply(reffn, x1, x2)); + }); +} + +#endif + +namespace internal +{ +// vector<vector> to vector<vector> +template <typename To, typename From, size_t N1, size_t N2, size_t Ns1> +struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>> +{ + static_assert(N1 == Ns1, ""); + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<From, N1>& value) + { + return vec<vec<To, N1>, N2>::from_flatten( + kfr::innercast<To>(value.flatten()) + .shuffle(csizeseq<N2 * vec<From, N1>::scalar_size()> % csize<N2>)); + } +}; +// vector<vector> to vector<vector> +template <typename To, typename From, size_t N1, size_t N2, size_t NN1, size_t NN2> +struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>> +{ + static_assert(N1 == NN1, ""); + static_assert(N2 == NN2, ""); + static_assert(!is_compound<To>::value, ""); + static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value) + { + return vec<vec<To, N1>, 
N2>::from_flatten(kfr::innercast<To>(value.flatten())); + } +}; +} // namespace internal + +template <typename T, size_t N1, size_t N2 = N1> +using mat = vec<vec<T, N1>, N2>; + +using u8x2x2 = vec<vec<u8, 2>, 2>; +using i8x2x2 = vec<vec<i8, 2>, 2>; +using u16x2x2 = vec<vec<u16, 2>, 2>; +using i16x2x2 = vec<vec<i16, 2>, 2>; +using u32x2x2 = vec<vec<u32, 2>, 2>; +using i32x2x2 = vec<vec<i32, 2>, 2>; +using u64x2x2 = vec<vec<u64, 2>, 2>; +using i64x2x2 = vec<vec<i64, 2>, 2>; +using f32x2x2 = vec<vec<f32, 2>, 2>; +using f64x2x2 = vec<vec<f64, 2>, 2>; + +using u8x4x4 = vec<vec<u8, 4>, 4>; +using i8x4x4 = vec<vec<i8, 4>, 4>; +using u16x4x4 = vec<vec<u16, 4>, 4>; +using i16x4x4 = vec<vec<i16, 4>, 4>; +using u32x4x4 = vec<vec<u32, 4>, 4>; +using i32x4x4 = vec<vec<i32, 4>, 4>; +using u64x4x4 = vec<vec<u64, 4>, 4>; +using i64x4x4 = vec<vec<i64, 4>, 4>; +using f32x4x4 = vec<vec<f32, 4>, 4>; +using f64x4x4 = vec<vec<f64, 4>, 4>; + +template <size_t N1, size_t N2> +struct vec_vec_template +{ + template <typename T> + using type = vec<vec<T, N1>, N2>; +}; + +} // namespace CMT_ARCH_NAME +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<T1, N>, kfr::vec<T2, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<kfr::vec<T1, N>, T2> + : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type> +{ +}; +template <typename T1, typename T2, size_t N> +struct common_type_impl<T1, kfr::vec<T2, N>> + : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type> +{ +}; + +template <typename T1, typename T2, size_t N1, size_t N2> +struct common_type_impl<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>> + : common_type_from_subtypes<T1, T2, kfr::vec_vec_template<N1, N2>::template type> +{ + using type = kfr::vec<kfr::vec<typename common_type_impl<T1, T2>::type, N1>, N2>; +}; +template <typename T1, typename T2, size_t N1, size_t N2> 
+struct common_type_impl<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>> + : common_type_from_subtypes<T1, T2, kfr::vec_vec_template<N1, N2>::template type> +{ +}; +} // namespace kfr + +namespace cometa +{ + +template <typename T, size_t N> +struct compound_type_traits<kfr::vec_shape<T, N>> +{ + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + + template <typename U> + using rebind = kfr::vec_shape<U, N>; + template <typename U> + using deep_rebind = kfr::vec_shape<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; +}; + +template <typename T, size_t N> +struct compound_type_traits<kfr::vec<T, N>> +{ + using subtype = T; + using deep_subtype = cometa::deep_subtype<T>; + constexpr static size_t width = N; + constexpr static size_t deep_width = width * compound_type_traits<T>::width; + constexpr static bool is_scalar = false; + constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; + template <typename U> + using rebind = kfr::vec<U, N>; + template <typename U> + using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; + + KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) + { + return value[index]; + } +}; + +namespace details +{ +template <typename T, size_t N> +struct flt_type_impl<kfr::vec<T, N>> +{ + using type = kfr::vec<typename flt_type_impl<T>::type, N>; +}; +} // namespace details +} // namespace cometa + +CMT_PRAGMA_GNU(GCC diagnostic pop) +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/testo/assert.hpp b/include/kfr/testo/assert.hpp @@ -1,4 +1,7 @@ -#pragma once +/** @addtogroup testo + * @{ + */ +#pragma once #include "comparison.hpp" diff --git 
a/include/kfr/testo/comparison.hpp b/include/kfr/testo/comparison.hpp @@ -1,4 +1,7 @@ -#pragma once +/** @addtogroup testo + * @{ + */ +#pragma once #include "../cometa/tuple.hpp" @@ -26,7 +29,7 @@ struct comparison R right; Fn cmp; - comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)) {} + comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)), cmp() {} bool operator()() const { return cmp(left, right); } }; @@ -53,28 +56,51 @@ CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") template <typename T> -inline T& epsilon() +inline T& current_epsilon() { static T value = std::numeric_limits<T>::epsilon(); return value; } +template <typename T> +struct eplison_scope +{ + eplison_scope(T scale) { current_epsilon<T>() = std::numeric_limits<T>::epsilon() * scale; } + ~eplison_scope() { current_epsilon<T>() = saved; } + T saved = current_epsilon<T>(); +}; + +template <> +struct eplison_scope<void> +{ + eplison_scope(float scale) : f(scale), d(scale), ld(scale) {} + eplison_scope<float> f; + eplison_scope<double> d; + eplison_scope<long double> ld; +}; + template <> struct equality_comparer<float, float> { - bool operator()(const float& l, const float& r) const { return !(std::abs(l - r) > epsilon<float>()); } + bool operator()(const float& l, const float& r) const + { + return !(std::abs(l - r) > current_epsilon<float>()); + } }; template <> struct equality_comparer<double, double> { - bool operator()(const double& l, const double& r) const { return !(std::abs(l - r) > epsilon<double>()); } + bool operator()(const double& l, const double& r) const + { + return !(std::abs(l - r) > current_epsilon<double>()); + } }; template <> struct equality_comparer<long double, long double> { bool operator()(const long double& l, const long double& r) const { - return !(std::abs(l - r) > epsilon<long double>()); + return !(std::abs(l - r) > current_epsilon<long 
double>()); } }; diff --git a/include/kfr/testo/console_colors.hpp b/include/kfr/testo/console_colors.hpp @@ -0,0 +1,166 @@ +#pragma once +#include <cstdint> +#include <cstdio> + +//#define CONSOLE_COLORS_FORCE_ASCII + +#if defined _WIN32 && !defined PRINT_COLORED_FORCE_ASCII +#define USE_WIN32_API +#endif + +#if defined(USE_WIN32_API) + +namespace win32_lite +{ +typedef void* HANDLE; +typedef uint32_t DWORD; + +#define WIN32_LITE_STD_INPUT_HANDLE ((win32_lite::DWORD)-10) +#define WIN32_LITE_STD_OUTPUT_HANDLE ((win32_lite::DWORD)-11) +#define WIN32_LITE_STD_ERROR_HANDLE ((win32_lite::DWORD)-12) + +#define WIN32_LITE_ENABLE_VIRTUAL_TERMINAL_PROCESSING (4) + +#define WIN32_LITE_DECLSPEC_IMPORT __declspec(dllimport) + +#define WIN32_LITE_WINAPI __stdcall + +typedef short SHORT; +typedef unsigned short WORD; +typedef int WINBOOL; + +extern "C" +{ + WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI GetConsoleMode(HANDLE hConsole, DWORD* dwMode); + WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleMode(HANDLE hConsole, DWORD dwMode); + WIN32_LITE_DECLSPEC_IMPORT HANDLE WIN32_LITE_WINAPI GetStdHandle(DWORD nStdHandle); + WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleTextAttribute(HANDLE hConsoleOutput, + WORD wAttributes); +} +} // namespace win32_lite + +#endif + +namespace console_colors +{ + +enum text_color : uint32_t +{ + Black = 0x00, + DarkBlue = 0x01, + DarkGreen = 0x02, + DarkCyan = 0x03, + DarkRed = 0x04, + DarkMagenta = 0x05, + DarkYellow = 0x06, + LightGrey = 0x07, + Gray = 0x08, + Blue = 0x09, + Green = 0x0A, + Cyan = 0x0B, + Red = 0x0C, + Magenta = 0x0D, + Yellow = 0x0E, + White = 0x0F, + BgBlack = 0x00, + BgDarkBlue = 0x10, + BgDarkGreen = 0x20, + BgDarkCyan = 0x30, + BgDarkRed = 0x40, + BgDarkMagenta = 0x50, + BgDarkYellow = 0x60, + BgLightGrey = 0x70, + BgGray = 0x80, + BgBlue = 0x90, + BgGreen = 0xA0, + BgCyan = 0xB0, + BgRed = 0xC0, + BgMagenta = 0xD0, + BgYellow = 0xE0, + BgWhite = 0xF0, + + Normal = BgBlack | LightGrey 
+}; + +enum console_buffer +{ + ConsoleStdOutput, + ConsoleStdError +}; + +struct console_color +{ +public: + console_color(text_color c, console_buffer console = ConsoleStdOutput) + : m_old(get(console)), m_console(console) + { + set(c, m_console); + } + + ~console_color() { set(m_old, m_console); } + +private: + text_color get(console_buffer = ConsoleStdOutput) { return saved_color(); } + + void set(text_color new_color, console_buffer console = ConsoleStdOutput) + { +#ifdef USE_WIN32_API + win32_lite::SetConsoleTextAttribute(win32_lite::GetStdHandle(console == ConsoleStdOutput + ? WIN32_LITE_STD_OUTPUT_HANDLE + : WIN32_LITE_STD_ERROR_HANDLE), + static_cast<win32_lite::WORD>(new_color)); +#else + if (new_color != Normal) + { + uint8_t t = new_color & 0xF; + uint8_t b = (new_color & 0xF0) >> 4; + uint8_t tnum = 30 + ((t & 1) << 2 | (t & 2) | (t & 4) >> 2); + uint8_t bnum = 40 + ((b & 1) << 2 | (b & 2) | (b & 4) >> 2); + if (t & 8) + tnum += 60; + if (b & 8) + bnum += 60; + std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[%d;%dm", tnum, bnum); + } + else + { + std::fprintf(console == ConsoleStdOutput ? 
stdout : stderr, "\x1B[0m"); + } +#endif + saved_color() = new_color; + } + + text_color m_old; + console_buffer m_console; + static text_color& saved_color() + { + static text_color color = Normal; + return color; + } +}; + +template <text_color color, console_buffer console = ConsoleStdOutput> +struct console_color_tpl : public console_color +{ +public: + console_color_tpl() : console_color(color, console) {} + +private: +}; + +typedef console_color_tpl<DarkBlue> darkblue_text; +typedef console_color_tpl<DarkGreen> darkgreen_text; +typedef console_color_tpl<DarkCyan> darkcyan_text; +typedef console_color_tpl<DarkRed> darkred_text; +typedef console_color_tpl<DarkMagenta> darkmagenta_text; +typedef console_color_tpl<DarkYellow> darkyellow_text; +typedef console_color_tpl<LightGrey> lightgrey_text; +typedef console_color_tpl<Gray> gray_text; +typedef console_color_tpl<Blue> blue_text; +typedef console_color_tpl<Green> green_text; +typedef console_color_tpl<Cyan> cyan_text; +typedef console_color_tpl<Red> red_text; +typedef console_color_tpl<Magenta> magenta_text; +typedef console_color_tpl<Yellow> yellow_text; +typedef console_color_tpl<White> white_text; +} // namespace console_colors diff --git a/include/kfr/testo/double_double.hpp b/include/kfr/testo/double_double.hpp @@ -0,0 +1,170 @@ +#pragma once + +#include <algorithm> +#include <bitset> +#include <cmath> +#include <cstring> + +struct precise_fp +{ + int sign; // 1 means '+', -1 means '-', can't be 0 + int exponent; // unbiased, INT_MIN means 0/denormal, INT_MAX means inf/nan + uint64_t mantissa; // with explicit first bit set, 63 significant bits + + bool is_zero() const { return exponent == INT_MIN && mantissa == 0; } + bool is_denormal() const { return exponent == INT_MIN && mantissa != 0; } + bool is_inf() const { return exponent == INT_MAX && mantissa == 0; } + bool is_nan() const { return exponent == INT_MAX && mantissa != 0; } + + double to_double() const { return sign * 
std::ldexp(static_cast<double>(mantissa), exponent); } + float to_float() const { return sign * std::ldexp(static_cast<float>(mantissa), exponent); } + + precise_fp(int sign, int exponent, uint64_t mantissa) : sign(sign), exponent(exponent), mantissa(mantissa) + { + } + + template <typename T> + explicit precise_fp(T value) + { + sign = static_cast<int>(std::copysign(T(1), value)); + if (value == 0) + { + mantissa = 0; + exponent = INT_MIN; + } + else if (std::isinf(value)) + { + mantissa = 0; + exponent = INT_MAX; + } + else if (std::isnan(value)) + { + mantissa = 1; + exponent = INT_MAX; + } + else + { + mantissa = 0x80000000'00000000ull * std::frexp(value, &exponent); + } + } + + friend double precise_ulps(const precise_fp& x, const float& y) + { + return precise_ulps(x, precise_fp(y), -126, 24); + } + friend double precise_ulps(const precise_fp& x, const double& y) + { + return precise_ulps(x, precise_fp(y), -1022, 53); + } + + friend double precise_ulps(const precise_fp& x, const precise_fp& y, int minexponent, int mantissabits) + { + if (x.is_zero() && y.is_zero()) + return 0; + if (x.is_nan() && y.is_nan()) + return 0; + if (x.is_inf() && y.is_inf()) + return x.sign == y.sign ? 0 : HUGE_VAL; + if (x.is_zero() && y.is_zero()) + return 0; + + if (x.sign != y.sign) + return HUGE_VAL; + uint64_t xx = x.mantissa; + uint64_t yy = y.mantissa; + const int minexp = std::min(x.exponent, y.exponent); + if (x.exponent - minexp <= 1 && y.exponent - minexp <= 1) + { + xx >>= y.exponent - minexp; + yy >>= x.exponent - minexp; + return static_cast<double>(xx > yy ? 
xx - yy : yy - xx) / (1 << (63 - mantissabits)); + } + return HUGE_VAL; + } +}; + +struct double_double +{ + double hi, lo; + + static_assert(sizeof(double) == 8, ""); + + constexpr double_double(double x) noexcept : hi(x), lo(0.0) {} + constexpr double_double(float x) noexcept : hi(x), lo(0.0) {} + constexpr double_double(double hi, double lo) noexcept : hi(hi + lo), lo((hi - (hi + lo)) + lo) {} + constexpr operator double() const noexcept { return hi + lo; } + constexpr operator float() const noexcept { return hi + lo; } + + constexpr static double abs(double x) noexcept { return x >= 0 ? x : -x; } + + constexpr friend double_double operator-(const double_double& x) noexcept { return { -x.hi, -x.lo }; } + constexpr friend double_double operator+(const double_double& x, const double_double& y) noexcept + { + const double sum = x.hi + y.hi; + return { sum, abs(x.hi) > abs(y.hi) ? (((x.hi - sum) + y.hi) + y.lo) + x.lo + : (((y.hi - sum) + x.hi) + x.lo) + y.lo }; + } + constexpr friend double_double operator-(const double_double& x, const double_double& y) noexcept + { + const double diff = x.hi - y.hi; + return { diff, abs(x.hi) > abs(y.hi) ? 
(((x.hi - diff) - y.hi) - y.lo) + x.lo + : (((-y.hi - diff) + x.hi) + x.lo) - y.lo }; + } + constexpr friend double_double operator*(const double_double& x, const double_double& y) noexcept + { + const double_double c = mul(x.hi, y.hi); + const double cc = (x.hi * y.lo + x.lo * y.hi) + c.lo; + return { c.hi, cc }; + } + constexpr friend double_double operator/(const double_double& x, const double_double& y) noexcept + { + const double c = x.hi / y.hi; + const double_double u = mul(c, y.hi); + const double cc = ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) / y.hi; + return { c, cc }; + } + + bool isinf() const noexcept { return std::isinf(hi); } + bool isnan() const noexcept { return std::isnan(hi) || std::isnan(lo); } + bool iszero() const noexcept { return hi == 0 && lo == 0; } + + double ulp(float value) const noexcept + { + if (std::isnan(value) && isnan()) + return 0.0; + if (std::isinf(value) && isinf() && (std::copysign(1.0f, value) == std::copysign(1.0, hi))) + return 0.0; + if (value == 0 && iszero()) + return 0.0; + if (std::nexttoward(value, 0.0) == 0.0 && iszero()) + return 1.0; + return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); + } + double ulp(double value) const noexcept + { + if (std::isnan(value) && isnan()) + return 0.0; + if (std::isinf(value) && isinf() && (std::copysign(1.0, value) == std::copysign(1.0, hi))) + return 0.0; + if (value == 0 && iszero()) + return 0.0; + if (std::nexttoward(value, 0.0) == 0.0 && iszero()) + return 1.0; + return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0)); + } + +private: + constexpr static double_double splitprec(double x) noexcept + { + const double p = x * 1.34217729e8; + const double h = (x - p) + p; + return { h, x - h }; + } + constexpr static double_double mul(double x, double y) noexcept + { + const double_double xx = splitprec(x); + const double_double yy = splitprec(y); + const double z = x * y; + return { z, ((xx.hi * yy.hi - z) + xx.hi * 
yy.lo + xx.lo * yy.hi) + xx.lo * yy.lo }; + } +}; diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp @@ -1,4 +1,7 @@ -#pragma once +/** @addtogroup testo + * @{ + */ +#pragma once #include "comparison.hpp" @@ -12,7 +15,8 @@ #include <mpfr/mpfr.hpp> #include <mpfr/mpfr_tostring.hpp> #endif -#include "../ext/console_colors.hpp" +#include "console_colors.hpp" +#include <cassert> #include <chrono> #include <cmath> @@ -21,6 +25,7 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wexit-time-destructors") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpadded") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses") namespace testo { @@ -101,6 +106,15 @@ inline test_case*& active_test() return instance; } +struct scope +{ + std::string text; + test_case* current_test; + scope* parent; + scope(std::string text); + ~scope(); +}; + struct test_case { using test_func = void (*)(); @@ -155,12 +169,14 @@ struct test_case } console_color cc(White); } + subtests.clear(); return !failed; } void check(bool result, const std::string& value, const char* expr) { - subtests.push_back(subtest{ result, as_string(padleft(22, expr), " | ", value), comment }); + subtests.push_back( + subtest{ result, as_string(padleft(22, expr), " | ", value), current_scope_text() }); result ? 
success++ : failed++; if (show_progress) { @@ -191,43 +207,59 @@ struct test_case check(result, as_string(comparison.left), expr); } - void append_comment(const std::string& text) + struct subtest + { + bool success; + std::string text; + std::string comment; + }; + + void scope_changed() { - comment += text; if (show_progress) { println(); - println(text, ":"); + println(current_scope_text(), ":"); } } - - void set_comment(const std::string& text) + std::string current_scope_text() const { - comment = text; - if (show_progress) + scope* s = this->current_scope; + std::string result; + while (s) { - println(); - println(text, ":"); + if (!result.empty()) + result = "; " + result; + result = s->text + result; + s = s->parent; } + return result; } - struct subtest - { - bool success; - std::string text; - std::string comment; - }; - test_func func; const char* name; std::vector<subtest> subtests; - std::string comment; int success; int failed; double time; bool show_progress; + scope* current_scope = nullptr; }; +inline scope::scope(std::string text) + : text(std::move(text)), current_test(active_test()), parent(current_test->current_scope) +{ + current_test->current_scope = this; + current_test->scope_changed(); +} + +inline scope::~scope() +{ + assert(active_test() == current_test); + assert(current_test->current_scope == this); + current_test->current_scope = parent; +} + template <typename Number> struct statistics { @@ -267,10 +299,10 @@ template <typename Arg0, typename Fn> void matrix(named_arg<Arg0>&& arg0, Fn&& fn) { cforeach(std::forward<Arg0>(arg0.value), [&](auto v0) { - active_test()->set_comment(as_string(arg0.name, " = ", v0)); + scope s(as_string(arg0.name, " = ", v0)); fn(v0); }); - if (active_test()->show_progress) + if (active_test() && active_test()->show_progress) println(); } @@ -278,7 +310,7 @@ template <typename Arg0, typename Arg1, typename Fn> void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, Fn&& fn) { 
cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), [&](auto v0, auto v1) { - active_test()->set_comment(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1)); + scope s(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1)); fn(v0, v1); }); if (active_test()->show_progress) @@ -290,7 +322,7 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar { cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value), [&](auto v0, auto v1, auto v2) { - active_test()->set_comment( + scope s( as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ", v2)); fn(v0, v1, v2); }); @@ -298,27 +330,53 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar println(); } +template <typename Arg0, typename Arg1, typename Arg2, typename Arg3, typename Fn> +void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& arg2, named_arg<Arg3>&& arg3, + Fn&& fn) +{ + cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value), + std::forward<Arg3>(arg3.value), [&](auto v0, auto v1, auto v2, auto v3) { + scope s(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ", + v2, arg3.name, " = ", v3)); + fn(v0, v1, v2, v3); + }); + if (active_test()->show_progress) + println(); +} + CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show_successful = false) { std::vector<test_case*> success; std::vector<test_case*> failed; + int success_checks = 0; + int failed_checks = 0; for (test_case* t : test_case::tests()) { if (name.empty() || t->name == name) + { t->run(show_successful) ? 
success.push_back(t) : failed.push_back(t); + success_checks += t->success; + failed_checks += t->failed; + } } printfmt("{}\n", std::string(79, '=')); if (!success.empty()) { console_color cc(Green); printfmt("[{}]", padcenter(11, "SUCCESS", '-')); - printfmt(" {} tests\n", success.size()); + printfmt(" {}/{} tests {}/{} checks\n", success.size(), success.size() + failed.size(), + success_checks, success_checks + failed_checks); } if (!failed.empty()) { console_color cc(Red); printfmt("[{}]", padcenter(11, "ERROR", '-')); - printfmt(" {} tests\n", failed.size()); + printfmt(" {}/{} tests {}/{} checks\n", failed.size(), success.size() + failed.size(), failed_checks, + success_checks + failed_checks); + for (test_case* t : failed) + { + print(" ", t->name, "\n"); + } } return static_cast<int>(failed.size()); } @@ -334,6 +392,13 @@ void assert_is_same_decay() static_assert(std::is_same<cometa::decay<T1>, cometa::decay<T2>>::value, ""); } +template <typename T, size_t NArgs> +struct test_data_entry +{ + T arguments[NArgs]; + T result; +}; + #define TESTO_CHECK(...) 
\ do \ { \ @@ -354,6 +419,7 @@ void assert_is_same_decay() #define TEST TESTO_TEST #define DTEST TESTO_DTEST #endif + } // namespace testo CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/version.hpp b/include/kfr/version.hpp @@ -25,8 +25,7 @@ */ #pragma once -#include "base/types.hpp" -#include "cpuid/cpuid_auto.hpp" +#include "runtime/cpuid_auto.hpp" namespace kfr { diff --git a/sources.cmake b/sources.cmake @@ -7,99 +7,52 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/all.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cpuid.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io.hpp ${PROJECT_SOURCE_DIR}/include/kfr/math.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/runtime.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd.hpp ${PROJECT_SOURCE_DIR}/include/kfr/version.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cident.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/abs.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/asin_acos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/kfr.h ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/bitwise.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/clamp.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/comparison.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/compiletime.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/complex.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/constants.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/digitreverse.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/filter.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/fraction.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/function.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/gamma.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/function_expressions.hpp 
${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/horizontal.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/hyperbolic.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/logical.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/log_exp.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/min_max.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/modzerobessel.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/operators.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/platform.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/read_write.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/round.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/saturation.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/select.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/shuffle.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_clang.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_intrin.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_x86.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/sin_cos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/sqrt.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/tan.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/types.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/vec.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/intrinsics.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/kfr.h - ${PROJECT_SOURCE_DIR}/include/kfr/base/specializations.i - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/abs.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/asin_acos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/atan.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/clamp.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/gamma.hpp - 
${PROJECT_SOURCE_DIR}/include/kfr/base/impl/hyperbolic.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/logical.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/log_exp.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/min_max.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/modzerobessel.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/round.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/saturation.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/select.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/sin_cos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/sqrt.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/tan.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/array.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/cstring.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/ctti.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/function.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/named_arg.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/cometa/numeric.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/range.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/result.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/string.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/tuple.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cpuid/cpuid.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/cpuid/cpuid_auto.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/data/bitrev.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/data/sincos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/cache.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/convolution.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/reference_dft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/dft_c.h + ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/bitrev.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/sincos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/bitrev.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-templates.hpp + 
${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-templates.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/ft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad_design.hpp @@ -110,7 +63,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fir_design.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/fracdelay.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/goertzel.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dsp/interpolation.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/mixdown.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/oscillators.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/sample_rate_conversion.hpp @@ -120,15 +72,114 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/dsp/waveshaper.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/weighting.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/window.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/ext/console_colors.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/ext/double_double.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/audiofile.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h + ${PROJECT_SOURCE_DIR}/include/kfr/math/abs.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/clamp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/complex_math.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/gamma.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/hyperbolic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/interpolation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/logical.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/log_exp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/min_max.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/modzerobessel.hpp + 
${PROJECT_SOURCE_DIR}/include/kfr/math/round.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/saturation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/select.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/sin_cos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/sqrt.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/tan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/abs.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/asin_acos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/atan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/clamp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/gamma.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/hyperbolic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/logical.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/log_exp.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/min_max.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/modzerobessel.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/round.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/saturation.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/select.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sin_cos.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sqrt.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/tan.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid_auto.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/comparison.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/constants.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/digitreverse.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/horizontal.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/mask.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/operators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/platform.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/read_write.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/shuffle.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/types.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/vec.hpp + 
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_clang.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_generic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_clang.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i ${PROJECT_SOURCE_DIR}/include/kfr/testo/assert.hpp ${PROJECT_SOURCE_DIR}/include/kfr/testo/comparison.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/testo/console_colors.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/testo/double_double.hpp ${PROJECT_SOURCE_DIR}/include/kfr/testo/testo.hpp ) + + +set( + KFR_DFT_SRC + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-src.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f32.cpp + ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f64.cpp +) + + +set( + KFR_IO_SRC + ${PROJECT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp +) + + +set( + KFR_UNITTEST_SRC + ${PROJECT_SOURCE_DIR}/tests/unit/base/conversion.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/base/reduce.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/abs.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/asin_acos.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/atan.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/hyperbolic.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/log_exp.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/min_max.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/round.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/select.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/math/sin_cos.cpp + 
${PROJECT_SOURCE_DIR}/tests/unit/math/tan.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/complex.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/operators.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/shuffle.cpp + ${PROJECT_SOURCE_DIR}/tests/unit/simd/vec.cpp +) + + +\ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -15,12 +15,26 @@ # along with KFR. -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.1) add_definitions(-DKFR_TESTING=1) +add_definitions(-DKFR_SRC_DIR=\"${CMAKE_SOURCE_DIR}\") + +# Binary output directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") +if (ENABLE_ASMTEST) + add_executable(asm_test asm_test.cpp) + target_link_libraries(asm_test kfr) + target_set_arch(asm_test PRIVATE avx) + target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED) + + add_custom_command(TARGET asm_test POST_BUILD COMMAND objconv -fyasm $<TARGET_FILE:asm_test>) +endif() + if (NOT ARM) if(MSVC AND NOT CLANG) add_executable(multiarch multiarch.cpp multiarch_fir_sse2.cpp multiarch_fir_avx.cpp) @@ -34,67 +48,96 @@ if (NOT ARM) target_link_libraries(multiarch kfr) endif () -find_package(MPFR) -find_package(GMP) - set(ALL_TESTS_CPP - all_tests.cpp - base_test.cpp - complex_test.cpp - dsp_test.cpp - expression_test.cpp - intrinsic_test.cpp - io_test.cpp - resampler_test.cpp) + base_test.cpp + complex_test.cpp + dsp_test.cpp + expression_test.cpp + intrinsic_test.cpp + io_test.cpp + ${KFR_UNITTEST_SRC}) + +# set(ALL_TESTS_MERGED_CPP all_tests_merged.cpp) if (ENABLE_DFT) list(APPEND ALL_TESTS_CPP dft_test.cpp) endif () +find_package(MPFR) +find_package(GMP) + if (MPFR_FOUND AND GMP_FOUND) - list(APPEND ALL_TESTS_CPP transcendental_test.cpp) -else () - message(STATUS "MPFR is not found. 
Skipping transcendental_test") + message(STATUS "MPFR is found") + add_executable(generate_data generate_data.cpp) + target_link_libraries(generate_data kfr) + target_include_directories(generate_data PRIVATE ${MPFR_INCLUDE_DIR} ${GMP_INCLUDE_DIR}) + target_link_libraries(generate_data ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) + if (REGENERATE_TESTS) + add_custom_command(TARGET generate_data POST_BUILD + COMMENT "Generating tests..." + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests/data + COMMAND generate_data) + endif () endif () -add_executable(all_tests ${ALL_TESTS_CPP}) +add_executable(all_tests all_tests.cpp ${ALL_TESTS_CPP}) target_compile_definitions(all_tests PRIVATE KFR_NO_MAIN) +target_link_libraries(all_tests kfr use_arch) if (ENABLE_DFT) - target_link_libraries(all_tests kfr kfr_dft) + target_link_libraries(all_tests kfr_dft) endif () -target_link_libraries(all_tests kfr kfr_io) +target_link_libraries(all_tests kfr_io) -if (MPFR_FOUND AND GMP_FOUND) - add_definitions(-DHAVE_MPFR) - include_directories(${MPFR_INCLUDE_DIR} ${GMP_INCLUDE_DIR}) - target_link_libraries(all_tests ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) -endif () +function(add_x86_test ARCH) + set(NAME ${ARCH}) -function(add_x86_test NAME FLAGS) - separate_arguments(FLAGS) - add_executable(all_tests_${NAME} ${ALL_TESTS_CPP} ${KFR_IO_SRC}) + add_executable(all_tests_${NAME} all_tests.cpp ${ALL_TESTS_CPP} ${KFR_IO_SRC}) if (ENABLE_DFT) target_sources(all_tests_${NAME} PRIVATE ${KFR_DFT_SRC}) endif () - target_compile_options(all_tests_${NAME} PRIVATE ${FLAGS}) - target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN) target_link_libraries(all_tests_${NAME} kfr) + target_set_arch(all_tests_${NAME} PRIVATE ${ARCH}) + target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN) target_compile_definitions(all_tests_${NAME} PUBLIC KFR_ENABLE_FLAC=1) - if (MPFR_FOUND AND GMP_FOUND) - target_link_libraries(all_tests_${NAME} ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) + + if (ARCH_TESTS_MULTI) + 
add_library(all_tests_multiarch_${NAME} STATIC ${ALL_TESTS_MERGED_CPP} ${KFR_IO_SRC}) + if (ENABLE_DFT) + target_sources(all_tests_multiarch_${NAME} PRIVATE ${KFR_DFT_SRC}) + endif () + target_link_libraries(all_tests_multiarch_${NAME} kfr) + target_set_arch(all_tests_multiarch_${NAME} PRIVATE ${ARCH}) + target_compile_definitions(all_tests_multiarch_${NAME} PRIVATE KFR_NO_MAIN) + target_compile_definitions(all_tests_multiarch_${NAME} PUBLIC KFR_ENABLE_FLAC=1) endif () + endfunction() if (ARCH_TESTS) - set (ARCH_RESET "-march=x86-64 -mno-sse3 -mno-ssse3 -mno-sse4.1 -mno-sse4.2 -mno-avx -mno-avx2 -mno-fma -mno-avx512f -mno-avx512cd -mno-avx512bw -mno-avx512dq -mno-avx512vl") - add_x86_test(generic "${ARCH_RESET} -DCMT_FORCE_GENERIC_CPU") - add_x86_test(sse2 "${ARCH_RESET} -msse2") - add_x86_test(sse3 "${ARCH_RESET} -msse3 -mno-avx") - add_x86_test(ssse3 "${ARCH_RESET} -mssse3 -mno-avx") - add_x86_test(sse41 "${ARCH_RESET} -msse4.1 -mno-avx") - add_x86_test(avx "${ARCH_RESET} -msse4.1 -mavx") - add_x86_test(avx2 "${ARCH_RESET} -msse4.1 -mavx2 -mfma") - add_x86_test(avx512 "${ARCH_RESET} -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl") + if (NOT MSVC OR CLANG) + add_x86_test(generic) + endif () + add_x86_test(sse2) + add_x86_test(sse3) + add_x86_test(ssse3) + add_x86_test(sse41) + add_x86_test(avx) + add_x86_test(avx2) + add_x86_test(avx512) + + if (ARCH_TESTS_MULTI) + add_executable(all_tests_multiarch all_tests.cpp) + target_compile_definitions(all_tests_multiarch PRIVATE KFR_MULTI_ARCH) + target_link_libraries(all_tests_multiarch + all_tests_multiarch_sse2 + all_tests_multiarch_sse3 + all_tests_multiarch_ssse3 + all_tests_multiarch_sse41 + all_tests_multiarch_avx + all_tests_multiarch_avx2 + all_tests_multiarch_avx512 + ) + endif () endif() if(USE_SDE) diff --git a/tests/all_tests.cpp b/tests/all_tests.cpp @@ -7,6 +7,24 @@ using namespace kfr; +#ifdef KFR_MULTI_ARCH + +#define FORCE_LINK(arch) \ + namespace arch \ + { \ + extern void 
force_link(); \ + void (*p)() = &force_link; \ + } + +FORCE_LINK(sse2) +FORCE_LINK(sse3) +FORCE_LINK(ssse3) +FORCE_LINK(sse41) +FORCE_LINK(avx) +FORCE_LINK(avx2) +// FORCE_LINK(avx512) +#endif + int main() { println(library_version(), " running on ", cpu_runtime()); @@ -16,7 +34,7 @@ int main() return -1; } #ifdef HAVE_MPFR - mpfr::scoped_precision p(128); + mpfr::scoped_precision p(64); #endif return testo::run_all(""); } diff --git a/tests/all_tests_merged.cpp b/tests/all_tests_merged.cpp @@ -0,0 +1,25 @@ +#include <kfr/cident.h> + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses") + +#include "auto_test.cpp" + +#include "base_test.cpp" +#include "complex_test.cpp" +#include "dsp_test.cpp" +#include "expression_test.cpp" +#include "intrinsic_test.cpp" +#include "io_test.cpp" +#include "resampler_test.cpp" + +#ifndef KFR_NO_DFT +#include "dft_test.cpp" +#endif + +namespace CMT_ARCH_NAME +{ +void force_link() {} +} // namespace CMT_ARCH_NAME + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp @@ -0,0 +1,213 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/base.hpp> +#include <kfr/io.hpp> +#include <kfr/testo/console_colors.hpp> + +using namespace kfr; + +#define TEST_ASM_8(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, ty, 16) \ + MACRO(fn, ty, 32) \ + MACRO(fn, ty, 64) + +#define TEST_ASM_16(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, ty, 16) \ + MACRO(fn, ty, 32) \ + MACRO(fn, ty, 64) + +#define TEST_ASM_32(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, ty, 16) \ + MACRO(fn, ty, 32) + +#define TEST_ASM_64(fn, ty, MACRO) \ + MACRO(fn, ty, 1) \ + MACRO(fn, ty, 2) \ + MACRO(fn, ty, 4) \ + MACRO(fn, ty, 8) \ + MACRO(fn, 
ty, 16) + +#ifdef CMT_COMPILER_MSVC +#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT +#else +#define KFR_PUBLIC CMT_PUBLIC_C +#endif + +#define TEST_ASM_VTY1(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x) { r = kfr::fn(x); } + +#define TEST_ASM_VTY1_F(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<flt_type<ty>, n>& r, const vec<ty, n>& x) \ + { \ + r = kfr::fn(x); \ + } + +#define TEST_ASM_VTY2(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__scalar(vec<ty, n>& r, const vec<ty, n>& x, \ + const ty& y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_CMP(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(mask<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_SHIFT(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, \ + const vec<utype<ty>, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_SHIFT_SCALAR(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__scalar(vec<ty, n>& r, const vec<ty, n>& x, unsigned y) \ + { \ + r = kfr::fn(x, y); \ + } +#define TEST_ASM_VTY3(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y, \ + const vec<ty, n>& z) \ + { \ + r = kfr::fn(x, y, z); \ + } + +#define GEN_ty(n, ty) ty(n) +#define GEN_arg_def(n, ty) ty arg##n +#define GEN_arg(n, ty) arg##n + +#define TEST_ASM_MAKE_VECTOR(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, CMT_GEN_LIST(n, GEN_arg_def, ty)) \ + { \ + r = kfr::fn(CMT_GEN_LIST(n, GEN_arg, ty)); \ + } \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__imm(vec<ty, n>& r) \ + { \ + r = kfr::fn(CMT_GEN_LIST(n, GEN_ty, ty)); \ + } + +#define TEST_ASM_BROADCAST(fn, ty, 
n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, ty x) { r = kfr::fn<n>(x); } + +#define TEST_ASM_HALF1(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n * 2>& x) { r = kfr::fn(x); } + +#define TEST_ASM_DOUBLE2(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x, \ + const vec<ty, n>& y) \ + { \ + r = kfr::fn(x, y); \ + } + +#define TEST_ASM_DOUBLE1(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); } + +#define TEST_ASM_U(fn, MACRO) \ + TEST_ASM_8(fn, u8, MACRO) \ + TEST_ASM_16(fn, u16, MACRO) \ + TEST_ASM_32(fn, u32, MACRO) \ + TEST_ASM_64(fn, u64, MACRO) + +#define TEST_ASM_I(fn, MACRO) \ + TEST_ASM_8(fn, i8, MACRO) \ + TEST_ASM_16(fn, i16, MACRO) \ + TEST_ASM_32(fn, i32, MACRO) \ + TEST_ASM_64(fn, i64, MACRO) + +#define TEST_ASM_F(fn, MACRO) \ + TEST_ASM_32(fn, f32, MACRO) \ + TEST_ASM_64(fn, f64, MACRO) + +#define TEST_ASM_UI(fn, MACRO) TEST_ASM_U(fn, MACRO) TEST_ASM_I(fn, MACRO) + +#define TEST_ASM_UIF(fn, MACRO) TEST_ASM_U(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO) + +#define TEST_ASM_IF(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO) + +TEST_ASM_UIF(add, TEST_ASM_VTY2) + +TEST_ASM_UIF(sub, TEST_ASM_VTY2) + +TEST_ASM_UIF(mul, TEST_ASM_VTY2) + +TEST_ASM_UIF(bitwiseand, TEST_ASM_VTY2) + +TEST_ASM_UIF(equal, TEST_ASM_CMP) + +TEST_ASM_IF(abs, TEST_ASM_VTY1) + +TEST_ASM_IF(sqrt, TEST_ASM_VTY1_F) + +TEST_ASM_IF(neg, TEST_ASM_VTY1) + +TEST_ASM_UIF(bitwisenot, TEST_ASM_VTY1) + +TEST_ASM_UIF(div, TEST_ASM_VTY2) + +TEST_ASM_UIF(bitwiseor, TEST_ASM_VTY2) + +TEST_ASM_UIF(bitwisexor, TEST_ASM_VTY2) + +TEST_ASM_UIF(notequal, TEST_ASM_CMP) + +TEST_ASM_UIF(less, TEST_ASM_CMP) + +TEST_ASM_UIF(greater, TEST_ASM_CMP) + +TEST_ASM_UIF(lessorequal, TEST_ASM_CMP) + +TEST_ASM_UIF(greaterorequal, TEST_ASM_CMP) + +TEST_ASM_UIF(low, TEST_ASM_HALF1) + +TEST_ASM_UIF(high, TEST_ASM_HALF1) + 
+TEST_ASM_UIF(concat, TEST_ASM_DOUBLE2) + +TEST_ASM_UIF(shl, TEST_ASM_SHIFT) + +TEST_ASM_UIF(shr, TEST_ASM_SHIFT) + +TEST_ASM_UIF(shl, TEST_ASM_SHIFT_SCALAR) + +TEST_ASM_UIF(shr, TEST_ASM_SHIFT_SCALAR) + +TEST_ASM_UIF(duphalfs, TEST_ASM_DOUBLE1) + +TEST_ASM_F(sin, TEST_ASM_VTY1_F) + +TEST_ASM_F(cos, TEST_ASM_VTY1_F) + +TEST_ASM_UIF(sqr, TEST_ASM_VTY1) + +TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR) + +TEST_ASM_UIF(broadcast, TEST_ASM_BROADCAST) + +namespace kfr +{ +#ifdef KFR_SHOW_NOT_OPTIMIZED +CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); } +#endif +} // namespace kfr + +int main() { println(library_version()); } diff --git a/tests/base_test.cpp b/tests/base_test.cpp @@ -6,11 +6,14 @@ #include <kfr/testo/testo.hpp> -#include <kfr/base.hpp> #include <kfr/io.hpp> +#include <kfr/simd.hpp> using namespace kfr; +namespace CMT_ARCH_NAME +{ + TEST(test_basic) { // How to make a vector: @@ -76,359 +79,20 @@ TEST(test_basic) CHECK(odd(numbers1) == vec<int, 4>{ 1, 3, 5, 7 }); CHECK(even(numbers2) == vec<int, 4>{ 100, 102, 104, 106 }); - // * The following command pairs are equivalent: - CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); - CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); - - CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) == - vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); - CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); - - CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) == - vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); - CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) == - vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); - - CHECK(splitpairs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6, 1, 3, 5, 7)); - CHECK(splitpairs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); - - 
CHECK(interleavehalfs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 4, 1, 5, 2, 6, 3, 7)); - CHECK(interleavehalfs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); - CHECK(subadd(pack(0, 1, 2, 3, 4, 5, 6, 7), pack(10, 10, 10, 10, 10, 10, 10, 10)) == pack(-10, 11, -8, 13, -6, 15, -4, 17)); CHECK(addsub(pack(0, 1, 2, 3, 4, 5, 6, 7), pack(10, 10, 10, 10, 10, 10, 10, 10)) == pack(10, -9, 12, -7, 14, -5, 16, -3)); - CHECK(broadcast<8>(1) == pack(1, 1, 1, 1, 1, 1, 1, 1)); - CHECK(broadcast<8>(1, 2) == pack(1, 2, 1, 2, 1, 2, 1, 2)); - CHECK(broadcast<8>(1, 2, 3, 4) == pack(1, 2, 3, 4, 1, 2, 3, 4)); - CHECK(broadcast<8>(1, 2, 3, 4, 5, 6, 7, 8) == pack(1, 2, 3, 4, 5, 6, 7, 8)); - - CHECK(even(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6)); - CHECK(odd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 3, 5, 7)); - - CHECK(even<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5)); - CHECK(odd<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(2, 3, 6, 7)); - - CHECK(reverse(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(7, 6, 5, 4, 3, 2, 1, 0)); - CHECK(reverse<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(6, 7, 4, 5, 2, 3, 0, 1)); - CHECK(reverse<4>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(4, 5, 6, 7, 0, 1, 2, 3)); - CHECK(digitreverse4(pack(0.f, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)) == pack(0.f, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); - CHECK(dup(pack(0, 1, 2, 3)) == pack(0, 0, 1, 1, 2, 2, 3, 3)); - CHECK(duphalfs(pack(0, 1, 2, 3)) == pack(0, 1, 2, 3, 0, 1, 2, 3)); - CHECK(dupeven(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 0, 2, 2, 4, 4, 6, 6)); - CHECK(dupodd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 1, 3, 3, 5, 5, 7, 7)); - CHECK(inrange(pack(1, 2, 3), 1, 3) == make_mask<int>(true, true, true)); CHECK(inrange(pack(1, 2, 3), 1, 2) == make_mask<int>(true, true, false)); CHECK(inrange(pack(1, 2, 3), 1, 1) == make_mask<int>(true, false, false)); - - // * Transpose matrix: - const auto sixteen = enumerate<float, 16>(); - CHECK(transpose<4>(sixteen) == vec<float, 16>(0, 
4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); -} - -TEST(concat) -{ - CHECK(concat(vec<f32, 1>{ 1 }, vec<f32, 2>{ 2, 3 }, vec<f32, 1>{ 4 }, vec<f32, 3>{ 5, 6, 7 }) // - == vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }); -} - -TEST(split) -{ - vec<f32, 1> a1; - vec<f32, 2> a23; - vec<f32, 1> a4; - vec<f32, 3> a567; - split(vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }, a1, a23, a4, a567); - CHECK(a1 == vec<f32, 1>{ 1 }); - CHECK(a23 == vec<f32, 2>{ 2, 3 }); - CHECK(a4 == vec<f32, 1>{ 4 }); - CHECK(a567 == vec<f32, 3>{ 5, 6, 7 }); -} - -TEST(broadcast) -{ - CHECK(broadcast<5>(3.f) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); - CHECK(broadcast<6>(1.f, 2.f) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 }); - CHECK(broadcast<6>(1.f, 2.f, 3.f) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); -} - -TEST(resize) -{ - CHECK(resize<5>(make_vector(3.f)) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); - CHECK(resize<6>(make_vector(1.f, 2.f)) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 }); - CHECK(resize<6>(make_vector(1.f, 2.f, 3.f)) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); -} - -TEST(make_vector) -{ - const signed char ch = -1; - CHECK(make_vector(1, 2, ch) == vec<i32, 3>{ 1, 2, -1 }); - const i64 v = -100; - CHECK(make_vector(1, 2, v) == vec<i64, 3>{ 1, 2, -100 }); - CHECK(make_vector<i64>(1, 2, ch) == vec<i64, 3>{ 1, 2, -1 }); - CHECK(make_vector<f32>(1, 2, ch) == vec<f32, 3>{ 1, 2, -1 }); - - CHECK(make_vector(f64x2{ 1, 2 }, f64x2{ 10, 20 }) == - vec<vec<f64, 2>, 2>{ f64x2{ 1, 2 }, f64x2{ 10, 20 } }); - CHECK(make_vector(1.f, f32x2{ 10, 20 }) == vec<vec<f32, 2>, 2>{ f32x2{ 1, 1 }, f32x2{ 10, 20 } }); -} - -TEST(apply) -{ - CHECK(apply([](int x) { return x + 1; }, make_vector(1, 2, 3, 4, 5)) == make_vector(2, 3, 4, 5, 6)); - CHECK(apply(fn::sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25)); -} - -TEST(zerovector) -{ - CHECK(zerovector<f32, 3>() == f32x3{ 0, 0, 0 }); - // CHECK(zerovector<i16, 3>() == i16x3{ 0, 0, 0 }); // clang 3.9 (trunk) crashes here - CHECK(zerovector(f64x8{}) == f64x8{ 0, 0, 0, 0, 0, 0, 0, 0 }); -} - 
-TEST(allonesvector) -{ - CHECK(bitcast<u32>(constants<f32>::allones()) == 0xFFFFFFFFu); - CHECK(bitcast<u64>(constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull); - - CHECK(~allonesvector<f32, 3>() == f32x3{ 0, 0, 0 }); - CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 }); - CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 }); } -TEST(low_high) -{ - CHECK(low(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(5, 6, 7, 8)); - - CHECK(low(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 3>(5, 6, 7)); - - CHECK(low(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 2>(5, 6)); - - CHECK(low(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 4>(1, 2, 3, 4)); - CHECK(high(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 1>(5)); - - CHECK(low(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(1, 2)); - CHECK(high(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(3, 4)); - - CHECK(low(vec<u8, 3>(1, 2, 3)) == vec<u8, 2>(1, 2)); - CHECK(high(vec<u8, 3>(1, 2, 3)) == vec<u8, 1>(3)); - - CHECK(low(vec<u8, 2>(1, 2)) == vec<u8, 1>(1)); - CHECK(high(vec<u8, 2>(1, 2)) == vec<u8, 1>(2)); -} - -#ifdef CMT_COMPILER_CLANG -TEST(matrix) -{ - using i32x2x2 = vec<vec<int, 2>, 2>; - const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; - CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } }); - - CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } }); - - i32x2 xy{ 10, 20 }; - i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; - xy = hadd(xy * m); - CHECK(xy == i32x2{ 40, 120 }); - - i32x2 xy2{ 10, 20 }; - xy2 = hadd(transpose(xy2 * m)); - CHECK(xy2 == i32x2{ 50, 110 }); -} -#endif - -TEST(is_convertible) -{ - static_assert(std::is_convertible<float, f32x4>::value, ""); - static_assert(std::is_convertible<float, f64x8>::value, ""); - static_assert(std::is_convertible<float, u8x3>::value, ""); - - 
static_assert(std::is_convertible<u16x4, i32x4>::value, ""); - static_assert(!std::is_convertible<u16x4, i32x3>::value, ""); - static_assert(!std::is_convertible<u16x1, u16x16>::value, ""); - - static_assert(std::is_convertible<float, complex<float>>::value, ""); - static_assert(std::is_convertible<float, complex<double>>::value, ""); - static_assert(std::is_convertible<short, complex<double>>::value, ""); - - static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, ""); - static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, ""); - - static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, ""); - static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, ""); - - testo::assert_is_same<i32x4, common_type<i32x4>>(); - testo::assert_is_same<u32x4, common_type<i32x4, u32x4>>(); - testo::assert_is_same<f64x4, common_type<i32x4, u32x4, f64x4>>(); - - CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f }); - CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 }); - CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 }); - - CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 }); - - CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f }); - CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. }); - CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. }); - - CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) == - vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); - - CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) == - vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. 
} }); - - CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{ - vec<float, 4>{ 1.f, 2.f, 3.f, 4.f }, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) == - vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } }); -} - -TEST(transcendental) -{ - CHECK(kfr::sin(1.0f) == 0.8414709848078965066525023216303f); - CHECK(kfr::sin(1.0) == 0.8414709848078965066525023216303); - - CHECK(kfr::cos(1.0f) == 0.54030230586813971740093660744298f); - CHECK(kfr::cos(1.0) == 0.54030230586813971740093660744298); - - CHECK(kfr::tan(1.0f) == 1.5574077246549022305069748074584f); - CHECK(kfr::tan(1.0) == 1.5574077246549022305069748074584); - - CHECK(kfr::asin(0.45f) == 0.46676533904729636185033976030414f); - CHECK(kfr::asin(0.45) == 0.46676533904729636185033976030414); - - CHECK(kfr::acos(0.45f) == 1.1040309877476002573809819313356f); - CHECK(kfr::acos(0.45) == 1.1040309877476002573809819313356); - - CHECK(kfr::atan(0.45f) == 0.42285392613294071296648279098114f); - CHECK(kfr::atan(0.45) == 0.42285392613294071296648279098114); - - CHECK(kfr::sinh(1.0f) == 1.1752011936438014568823818505956f); - CHECK(kfr::sinh(1.0) == 1.1752011936438014568823818505956); - - CHECK(kfr::cosh(1.0f) == 1.5430806348152437784779056207571f); - CHECK(kfr::cosh(1.0) == 1.5430806348152437784779056207571); - - CHECK(kfr::tanh(1.0f) == 0.76159415595576488811945828260479f); - CHECK(kfr::tanh(1.0) == 0.76159415595576488811945828260479); - - CHECK(kfr::exp(0.75f) == 2.1170000166126746685453698198371f); - CHECK(kfr::exp(0.75) == 2.1170000166126746685453698198371); - - CHECK(kfr::exp(-0.75f) == 0.47236655274101470713804655094327f); - CHECK(kfr::exp(-0.75) == 0.47236655274101470713804655094327); - - CHECK(kfr::log(2.45f) == 0.89608802455663561677548191074382f); - CHECK(kfr::log(2.45) == 0.89608802455663561677548191074382); -} - -TEST(horner) -{ - CHECK(horner(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 17, 34)); - CHECK(horner_odd(pack(0, 1, 2, 3), 1, 2, 3) == pack(0, 6, 114, 786)); 
- CHECK(horner_even(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 57, 262)); -} - -TEST(test_stat) -{ - { - univector<float, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == 15); - CHECK(mean(a) == 3); - CHECK(minof(a) == 1); - CHECK(maxof(a) == 5); - CHECK(sumsqr(a) == 55); - CHECK(rms(a) == 3.316624790355399849115f); - CHECK(product(a) == 120); - } - { - univector<double, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == 15); - CHECK(mean(a) == 3); - CHECK(minof(a) == 1); - CHECK(maxof(a) == 5); - CHECK(sumsqr(a) == 55); - CHECK(rms(a) == 3.316624790355399849115); - CHECK(product(a) == 120); - } - { - univector<int, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == 15); - CHECK(mean(a) == 3); - CHECK(minof(a) == 1); - CHECK(maxof(a) == 5); - CHECK(sumsqr(a) == 55); - CHECK(product(a) == 120); - } - { - univector<complex<float>, 5> a({ 1, 2, 3, 4, 5 }); - CHECK(sum(a) == c32{ 15 }); - CHECK(mean(a) == c32{ 3 }); - CHECK(sumsqr(a) == c32{ 55 }); - CHECK(product(a) == c32{ 120 }); - } -} - -TEST(sample_conversion) -{ - CHECK(convert_sample<float>(static_cast<i8>(-127)) == -1.f); - CHECK(convert_sample<float>(static_cast<i8>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i8>(127)) == 1.f); - - CHECK(convert_sample<float>(static_cast<i16>(-32767)) == -1.f); - CHECK(convert_sample<float>(static_cast<i16>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i16>(32767)) == 1.f); - - CHECK(convert_sample<float>(static_cast<i24>(-8388607)) == -1.f); - CHECK(convert_sample<float>(static_cast<i24>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i24>(8388607)) == 1.f); - - CHECK(convert_sample<float>(static_cast<i32>(-2147483647)) == -1.f); - CHECK(convert_sample<float>(static_cast<i32>(0)) == 0.f); - CHECK(convert_sample<float>(static_cast<i32>(2147483647)) == 1.f); - - CHECK(convert_sample<i8>(-1.f) == -127); - CHECK(convert_sample<i8>(0.f) == 0); - CHECK(convert_sample<i8>(1.f) == 127); - - CHECK(convert_sample<i16>(-1.f) == -32767); - CHECK(convert_sample<i16>(0.f) == 0); - 
CHECK(convert_sample<i16>(1.f) == 32767); - - CHECK(convert_sample<i24>(-1.f) == -8388607); - CHECK(convert_sample<i24>(0.f) == 0); - CHECK(convert_sample<i24>(1.f) == 8388607); - - CHECK(convert_sample<i32>(-1.f) == -2147483647); - CHECK(convert_sample<i32>(0.f) == 0); - CHECK(convert_sample<i32>(1.f) == 2147483647); -} - -TEST(sample_interleave_deinterleave) -{ - const size_t size = 50; - univector2d<float> in; - in.push_back(truncate(counter() * 3.f + 0.f, size)); - in.push_back(truncate(counter() * 3.f + 1.f, size)); - in.push_back(truncate(counter() * 3.f + 2.f, size)); - univector<float> out(size * 3); - interleave(out.data(), (const float* []){ in[0].data(), in[1].data(), in[2].data() }, 3, size); - CHECK(maxof(out - render(counter() * 1.f, out.size())) == 0); - - deinterleave((float* []){ in[0].data(), in[1].data(), in[2].data() }, out.data(), 3, size); - - CHECK(absmaxof(in[0] - render(counter() * 3.f + 0.f, size)) == 0); - CHECK(absmaxof(in[1] - render(counter() * 3.f + 1.f, size)) == 0); - CHECK(absmaxof(in[2] - render(counter() * 3.f + 2.f, size)) == 0); -} +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp @@ -11,6 +11,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + TEST(complex_vector) { const vec<c32, 1> c32x1{ c32{ 0, 1 } }; @@ -68,9 +71,11 @@ TEST(complex_math) { const vec<c32, 1> a{ c32{ 1, 2 } }; const vec<c32, 1> b{ c32{ 3, 4 } }; + CHECK(c32(vec<c32, 1>(2)[0]) == c32{ 2, 0 }); CHECK(a + b == make_vector(c32{ 4, 6 })); CHECK(a - b == make_vector(c32{ -2, -2 })); CHECK(a * b == make_vector(c32{ -5, 10 })); + CHECK(a * vec<c32, 1>(2) == make_vector(c32{ 2, 4 })); CHECK(a * 2 == make_vector(c32{ 2, 4 })); CHECK(a / b == make_vector(c32{ 0.44f, 0.08f })); CHECK(-a == make_vector(c32{ -1, -2 })); @@ -88,8 +93,7 @@ TEST(complex_math) CHECK(cabs(-3.f) == 3.f); CHECK(cabs(make_vector(-3.f)) == make_vector(3.f)); - testo::epsilon<f32>() *= 5; - testo::epsilon<f64>() *= 5; + 
testo::eplison_scope<void> eps(5); CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773f, 0.634963914784736f }); CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, -0.9888977057628651f }); @@ -176,13 +180,6 @@ TEST(complex_function_expressions) TEST(static_tests) { -#ifdef CMT_ARCH_SSE2 - static_assert(platform<f32, cpu_t::sse2>::vector_width == 4, ""); - static_assert(platform<c32, cpu_t::sse2>::vector_width == 2, ""); - static_assert(platform<i32, cpu_t::sse2>::vector_width == 4, ""); - static_assert(platform<complex<i32>, cpu_t::sse2>::vector_width == 2, ""); -#endif - static_assert(is_numeric<vec<complex<float>, 4>>::value, ""); static_assert(is_numeric_args<vec<complex<float>, 4>>::value, ""); @@ -207,8 +204,9 @@ TEST(static_tests) testo::assert_is_same<kfr::internal::arg<complex<int>>, kfr::internal::expression_scalar<kfr::complex<int>, 1>>(); - testo::assert_is_same<common_type<complex<int>, double>, complex<double>>(); + testo::assert_is_same<kfr::common_type<complex<int>, double>, complex<double>>(); } +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/data/acos_double_fuzz b/tests/data/acos_double_fuzz Binary files differ. diff --git a/tests/data/acos_double_narrow b/tests/data/acos_double_narrow Binary files differ. diff --git a/tests/data/acos_float_fuzz b/tests/data/acos_float_fuzz Binary files differ. diff --git a/tests/data/acos_float_narrow b/tests/data/acos_float_narrow Binary files differ. diff --git a/tests/data/asin_double_fuzz b/tests/data/asin_double_fuzz Binary files differ. diff --git a/tests/data/asin_double_narrow b/tests/data/asin_double_narrow Binary files differ. diff --git a/tests/data/asin_float_fuzz b/tests/data/asin_float_fuzz Binary files differ. diff --git a/tests/data/asin_float_narrow b/tests/data/asin_float_narrow Binary files differ. diff --git a/tests/data/atan2_double_fuzz b/tests/data/atan2_double_fuzz Binary files differ. 
diff --git a/tests/data/atan2_double_narrow b/tests/data/atan2_double_narrow Binary files differ. diff --git a/tests/data/atan2_float_fuzz b/tests/data/atan2_float_fuzz Binary files differ. diff --git a/tests/data/atan2_float_narrow b/tests/data/atan2_float_narrow Binary files differ. diff --git a/tests/data/atan_double_fuzz b/tests/data/atan_double_fuzz Binary files differ. diff --git a/tests/data/atan_double_narrow b/tests/data/atan_double_narrow Binary files differ. diff --git a/tests/data/atan_float_fuzz b/tests/data/atan_float_fuzz Binary files differ. diff --git a/tests/data/atan_float_narrow b/tests/data/atan_float_narrow Binary files differ. diff --git a/tests/data/cbrt_double_fuzz b/tests/data/cbrt_double_fuzz Binary files differ. diff --git a/tests/data/cbrt_double_narrow b/tests/data/cbrt_double_narrow Binary files differ. diff --git a/tests/data/cbrt_float_fuzz b/tests/data/cbrt_float_fuzz Binary files differ. diff --git a/tests/data/cbrt_float_narrow b/tests/data/cbrt_float_narrow Binary files differ. diff --git a/tests/data/cos_double_fuzz b/tests/data/cos_double_fuzz Binary files differ. diff --git a/tests/data/cos_double_narrow b/tests/data/cos_double_narrow Binary files differ. diff --git a/tests/data/cos_float_fuzz b/tests/data/cos_float_fuzz Binary files differ. diff --git a/tests/data/cos_float_narrow b/tests/data/cos_float_narrow Binary files differ. diff --git a/tests/data/cosh_double_fuzz b/tests/data/cosh_double_fuzz Binary files differ. diff --git a/tests/data/cosh_double_narrow b/tests/data/cosh_double_narrow Binary files differ. diff --git a/tests/data/cosh_float_fuzz b/tests/data/cosh_float_fuzz Binary files differ. diff --git a/tests/data/cosh_float_narrow b/tests/data/cosh_float_narrow Binary files differ. diff --git a/tests/data/coth_double_fuzz b/tests/data/coth_double_fuzz Binary files differ. diff --git a/tests/data/coth_double_narrow b/tests/data/coth_double_narrow Binary files differ. 
diff --git a/tests/data/coth_float_fuzz b/tests/data/coth_float_fuzz Binary files differ. diff --git a/tests/data/coth_float_narrow b/tests/data/coth_float_narrow Binary files differ. diff --git a/tests/data/exp10_double_fuzz b/tests/data/exp10_double_fuzz Binary files differ. diff --git a/tests/data/exp10_double_narrow b/tests/data/exp10_double_narrow Binary files differ. diff --git a/tests/data/exp10_float_fuzz b/tests/data/exp10_float_fuzz Binary files differ. diff --git a/tests/data/exp10_float_narrow b/tests/data/exp10_float_narrow Binary files differ. diff --git a/tests/data/exp2_double_fuzz b/tests/data/exp2_double_fuzz Binary files differ. diff --git a/tests/data/exp2_double_narrow b/tests/data/exp2_double_narrow Binary files differ. diff --git a/tests/data/exp2_float_fuzz b/tests/data/exp2_float_fuzz Binary files differ. diff --git a/tests/data/exp2_float_narrow b/tests/data/exp2_float_narrow Binary files differ. diff --git a/tests/data/exp_double_fuzz b/tests/data/exp_double_fuzz Binary files differ. diff --git a/tests/data/exp_double_narrow b/tests/data/exp_double_narrow Binary files differ. diff --git a/tests/data/exp_float_fuzz b/tests/data/exp_float_fuzz Binary files differ. diff --git a/tests/data/exp_float_narrow b/tests/data/exp_float_narrow Binary files differ. diff --git a/tests/data/gamma_double_fuzz b/tests/data/gamma_double_fuzz Binary files differ. diff --git a/tests/data/gamma_double_narrow b/tests/data/gamma_double_narrow Binary files differ. diff --git a/tests/data/gamma_float_fuzz b/tests/data/gamma_float_fuzz Binary files differ. diff --git a/tests/data/gamma_float_narrow b/tests/data/gamma_float_narrow Binary files differ. diff --git a/tests/data/log10_double_fuzz b/tests/data/log10_double_fuzz Binary files differ. diff --git a/tests/data/log10_double_narrow b/tests/data/log10_double_narrow Binary files differ. diff --git a/tests/data/log10_float_fuzz b/tests/data/log10_float_fuzz Binary files differ. 
diff --git a/tests/data/log10_float_narrow b/tests/data/log10_float_narrow Binary files differ. diff --git a/tests/data/log2_double_fuzz b/tests/data/log2_double_fuzz Binary files differ. diff --git a/tests/data/log2_double_narrow b/tests/data/log2_double_narrow Binary files differ. diff --git a/tests/data/log2_float_fuzz b/tests/data/log2_float_fuzz Binary files differ. diff --git a/tests/data/log2_float_narrow b/tests/data/log2_float_narrow Binary files differ. diff --git a/tests/data/log_double_fuzz b/tests/data/log_double_fuzz Binary files differ. diff --git a/tests/data/log_double_narrow b/tests/data/log_double_narrow Binary files differ. diff --git a/tests/data/log_float_fuzz b/tests/data/log_float_fuzz Binary files differ. diff --git a/tests/data/log_float_narrow b/tests/data/log_float_narrow Binary files differ. diff --git a/tests/data/sin_double_fuzz b/tests/data/sin_double_fuzz Binary files differ. diff --git a/tests/data/sin_double_narrow b/tests/data/sin_double_narrow Binary files differ. diff --git a/tests/data/sin_float_fuzz b/tests/data/sin_float_fuzz Binary files differ. diff --git a/tests/data/sin_float_narrow b/tests/data/sin_float_narrow Binary files differ. diff --git a/tests/data/sinh_double_fuzz b/tests/data/sinh_double_fuzz Binary files differ. diff --git a/tests/data/sinh_double_narrow b/tests/data/sinh_double_narrow Binary files differ. diff --git a/tests/data/sinh_float_fuzz b/tests/data/sinh_float_fuzz Binary files differ. diff --git a/tests/data/sinh_float_narrow b/tests/data/sinh_float_narrow Binary files differ. diff --git a/tests/data/tan_double_fuzz b/tests/data/tan_double_fuzz Binary files differ. diff --git a/tests/data/tan_double_narrow b/tests/data/tan_double_narrow Binary files differ. diff --git a/tests/data/tan_float_fuzz b/tests/data/tan_float_fuzz Binary files differ. diff --git a/tests/data/tan_float_narrow b/tests/data/tan_float_narrow Binary files differ. 
diff --git a/tests/data/tanh_double_fuzz b/tests/data/tanh_double_fuzz Binary files differ. diff --git a/tests/data/tanh_double_narrow b/tests/data/tanh_double_narrow Binary files differ. diff --git a/tests/data/tanh_float_fuzz b/tests/data/tanh_float_fuzz Binary files differ. diff --git a/tests/data/tanh_float_narrow b/tests/data/tanh_float_narrow Binary files differ. diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -14,6 +14,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + #ifdef KFR_NATIVE_F64 constexpr ctypes_t<float, double> dft_float_types{}; #else @@ -25,7 +28,7 @@ TEST(test_convolve) univector<fbase, 5> a({ 1, 2, 3, 4, 5 }); univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 }); univector<fbase> c = convolve(a, b); - CHECK(c.size() == 9); + CHECK(c.size() == 9u); CHECK(rms(c - univector<fbase>({ 0.25, 1., 2.75, 2.5, 3.75, 3.5, 1.5, -4., 7.5 })) < 0.0001); } @@ -44,7 +47,7 @@ TEST(test_correlate) univector<fbase, 5> a({ 1, 2, 3, 4, 5 }); univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 }); univector<fbase> c = correlate(a, b); - CHECK(c.size() == 9); + CHECK(c.size() == 9u); CHECK(rms(c - univector<fbase>({ 1.5, 1., 1.5, 2.5, 3.75, -4., 7.75, 3.5, 1.25 })) < 0.0001); } @@ -87,58 +90,60 @@ TEST(fft_accuracy) #endif println(sizes); - testo::matrix( - named("type") = dft_float_types, // - named("size") = sizes, // - [&gen](auto type, size_t size) { - using float_type = type_of<decltype(type)>; - const double min_prec = 0.000001 * std::log(size) * size; - - for (bool inverse : { false, true }) - { - testo::active_test()->append_comment(inverse ? 
"complex-inverse" : "complex-direct"); - univector<complex<float_type>> in = - truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); - univector<complex<float_type>> out = in; - univector<complex<float_type>> refout = out; - univector<complex<float_type>> outo = in; - const dft_plan<float_type> dft(size); - univector<u8> temp(dft.temp_size); - - reference_dft(refout.data(), in.data(), size, inverse); - dft.execute(outo, in, temp, inverse); - dft.execute(out, out, temp, inverse); - - const float_type rms_diff_inplace = rms(cabs(refout - out)); - CHECK(rms_diff_inplace < min_prec); - const float_type rms_diff_outofplace = rms(cabs(refout - outo)); - CHECK(rms_diff_outofplace < min_prec); - } - - if (size >= 4 && is_poweroftwo(size)) - { - univector<float_type> in = truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); - - univector<complex<float_type>> out = truncate(scalar(qnan), size); - univector<complex<float_type>> refout = truncate(scalar(qnan), size); - const dft_plan_real<float_type> dft(size); - univector<u8> temp(dft.temp_size); - - testo::active_test()->append_comment("real-direct"); - reference_fft(refout.data(), in.data(), size); - dft.execute(out, in, temp); - float_type rms_diff = rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1))); - CHECK(rms_diff < min_prec); - - univector<float_type> out2(size, 0.f); - testo::active_test()->append_comment("real-inverse"); - dft.execute(out2, out, temp); - out2 = out2 / size; - rms_diff = rms(in - out2); - CHECK(rms_diff < min_prec); - } - }); + testo::matrix(named("type") = dft_float_types, // + named("size") = sizes, // + [&gen](auto type, size_t size) { + using float_type = type_of<decltype(type)>; + const double min_prec = 0.000001 * std::log(size) * size; + + for (bool inverse : { false, true }) + { + testo::scope s(inverse ? 
"complex-inverse" : "complex-direct"); + univector<complex<float_type>> in = + truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); + univector<complex<float_type>> out = in; + univector<complex<float_type>> refout = out; + univector<complex<float_type>> outo = in; + const dft_plan<float_type> dft(size); + univector<u8> temp(dft.temp_size); + + reference_dft(refout.data(), in.data(), size, inverse); + dft.execute(outo, in, temp, inverse); + dft.execute(out, out, temp, inverse); + + const float_type rms_diff_inplace = rms(cabs(refout - out)); + CHECK(rms_diff_inplace < min_prec); + const float_type rms_diff_outofplace = rms(cabs(refout - outo)); + CHECK(rms_diff_outofplace < min_prec); + } + + if (size >= 4 && is_poweroftwo(size)) + { + univector<float_type> in = + truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); + + univector<complex<float_type>> out = truncate(scalar(qnan), size); + univector<complex<float_type>> refout = truncate(scalar(qnan), size); + const dft_plan_real<float_type> dft(size); + univector<u8> temp(dft.temp_size); + + testo::scope s("real-direct"); + reference_fft(refout.data(), in.data(), size); + dft.execute(out, in, temp); + float_type rms_diff = + rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1))); + CHECK(rms_diff < min_prec); + + univector<float_type> out2(size, 0.f); + s.text = "real-inverse"; + dft.execute(out2, out, temp); + out2 = out2 / size; + rms_diff = rms(in - out2); + CHECK(rms_diff < min_prec); + } + }); } +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp @@ -15,6 +15,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + struct TestFragment { float gain; // dB @@ -235,6 +238,13 @@ TEST(ebu_lra_1_2_3_and_4) }); } +TEST(note_to_hertz) +{ + testo::eplison_scope<void> eps(1000); + CHECK(kfr::note_to_hertz(60) == fbase(261.6255653005986346778499935233)); + CHECK(kfr::note_to_hertz(pack(60)) == 
pack(fbase(261.6255653005986346778499935233))); +} + TEST(delay) { const univector<float, 33> v1 = counter() + 100; @@ -265,7 +275,7 @@ TEST(mixdown) [](size_t i) { return i + i * 2 + 100; }); } -#ifdef CMT_COMPILER_CLANG +#ifdef CMT_COMPILER_CLANG__ TEST(mixdown_stereo) { const univector<double, 21> left = counter(); @@ -289,29 +299,85 @@ TEST(phasor) TEST(fir) { - const univector<double, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); - const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; - - CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> double { - double result = 0.0; - for (size_t i = 0; i < taps.size(); i++) - result += data.get(index - i, 0.0) * taps[i]; - return result; - }); +#ifdef CMT_COMPILER_MSVC + // testo::matrix causes error in MSVC + { + using T = float; + + const univector<T, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); + const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + + CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + + CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + } + { + using T = double; + + const univector<T, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); + const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + + CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + + CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + } +#else + 
testo::matrix(named("type") = ctypes_t<float +#ifdef KFR_NATIVE_F64 + , + double +#endif + >{}, + [](auto type) { + using T = type_of<decltype(type)>; - CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> double { - double result = 0.0; - for (size_t i = 0; i < taps.size(); i++) - result += data.get(index - i, 0.0) * taps[i]; - return result; - }); + const univector<T, 100> data = + counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5); + const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + + CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + + CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T { + T result = 0; + for (size_t i = 0; i < taps.size(); i++) + result += data.get(index - i, 0) * taps[i]; + return result; + }); + }); +#endif } #ifdef KFR_NATIVE_F64 TEST(fir_different) { const univector<float, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5f); - const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + // const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 }; + const univector<double, 4> taps{ 1, 2, 3, 4 }; CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> float { double result = 0.0; @@ -375,6 +441,114 @@ TEST(fir_complex) }); } +template <typename E, typename T, size_t size> +void test_ir(E&& e, const univector<T, size>& test_vector) +{ + substitute(e, to_pointer(unitimpulse<T>())); + const univector<T, size> ir = e; + println(absmaxof(ir - test_vector)); +} + +template <typename T, typename... Ts, univector_tag Tag> +inline const univector<T, Tag>& choose_array(const univector<T, Tag>& array, const univector<Ts, Tag>&...) +{ + return array; +} + +template <typename T, typename T2, typename... 
Ts, univector_tag Tag, KFR_ENABLE_IF(!is_same<T, T2>::value)> +inline const univector<T, Tag>& choose_array(const univector<T2, Tag>&, const univector<Ts, Tag>&... arrays) +{ + return choose_array<T>(arrays...); +} + +TEST(biquad_lowpass1) +{ + testo::matrix(named("type") = ctypes_t<float, double>{}, [](auto type) { + using T = type_of<decltype(type)>; + + const biquad_params<T> bq = biquad_lowpass<T>(0.1, 0.7); + + constexpr size_t size = 32; + + const univector<float, size> test_vector_f32{ + +0x8.9bce2p-7, +0xd.8383ep-6, +0x8.f908dp-5, +0xe.edc21p-6, +0x9.ae104p-6, +0x9.dcc24p-7, + +0xd.50584p-9, -0xf.2668p-13, -0xd.09ca1p-10, -0xe.15995p-10, -0xa.b90d2p-10, -0xc.edea4p-11, + -0xb.f14eap-12, -0xc.2cb44p-14, +0xb.4a4dep-15, +0xb.685dap-14, +0xa.b181fp-14, +0xf.0cb2bp-15, + +0x8.695d6p-15, +0xd.bedd4p-17, +0xf.5474p-20, -0xd.bb266p-19, -0x9.63ca1p-18, -0xf.ca567p-19, + -0xa.5231p-19, -0xa.9e934p-20, -0xe.ab52p-22, +0xa.3c4cp-26, +0xd.721ffp-23, +0xe.ccc1ap-23, + +0xb.5f248p-23, +0xd.d2c9ap-24, + }; + + const univector<double, size> test_vector_f64{ + +0x8.9bce2bf3663e8p-7, +0xd.8384010fdf1dp-6, +0x8.f908e7a36df6p-5, +0xe.edc2332a6d0bp-6, + +0x9.ae104af1da9ap-6, +0x9.dcc235ef68e7p-7, +0xd.5057ee425e05p-9, -0xf.266e42a99aep-13, + -0xd.09cad73642208p-10, -0xe.1599f32a83dp-10, -0xa.b90d8910a117p-10, -0xc.edeaabb890948p-11, + -0xb.f14edbb55383p-12, -0xc.2cb39b86f2dap-14, +0xb.4a506ecff055p-15, +0xb.685edfdb55358p-14, + +0xa.b182e32f8e298p-14, +0xf.0cb3dfd894b2p-15, +0x8.695df725b4438p-15, +0xd.beddc3606b9p-17, + +0xf.547004d20874p-20, -0xd.bb29b25b49b6p-19, -0x9.63cb9187da1dp-18, -0xf.ca588634fc618p-19, + -0xa.52322d320da78p-19, -0xa.9e9420154e4p-20, -0xe.ab51f7b0335ap-22, +0xa.3c6479980e1p-26, + +0xd.7223836599fp-23, +0xe.ccc47ddd18678p-23, +0xb.5f265b1be1728p-23, +0xd.d2cb83f8483f8p-24, + }; + + const univector<T, size> ir = biquad(bq, unitimpulse<T>()); + + CHECK(absmaxof(choose_array<T>(test_vector_f32, test_vector_f64) - ir) == 0); + }); +} + 
+TEST(biquad_lowpass2) +{ + testo::matrix(named("type") = ctypes_t<float, double>{}, [](auto type) { + using T = type_of<decltype(type)>; + + const biquad_params<T> bq = biquad_lowpass<T>(0.45, 0.2); + + constexpr size_t size = 32; + + const univector<float, size> test_vector_f32{ + +0x8.ce416p-4, +0x8.2979p-4, -0x8.a9d04p-7, +0xe.aeb3p-11, +0x8.204f8p-13, -0x8.20d78p-12, + +0x8.3379p-12, -0xf.83d81p-13, +0xe.8b5c4p-13, -0xd.9ddadp-13, +0xc.bedfcp-13, -0xb.ee123p-13, + +0xb.2a9e5p-13, -0xa.73ac4p-13, +0x9.c86f6p-13, -0x9.2828p-13, +0x8.92229p-13, -0x8.05b7p-13, + +0xf.048ffp-14, -0xe.0e849p-14, +0xd.28384p-14, -0xc.50a9p-14, +0xb.86e56p-14, -0xa.ca0b6p-14, + +0xa.19476p-14, -0x9.73d38p-14, +0x8.d8f64p-14, -0x8.48024p-14, +0xf.80aa2p-15, -0xe.82ad8p-15, + +0xd.94f22p-15, -0xc.b66d9p-15, + }; + + const univector<double, size> test_vector_f64{ + +0x8.ce416c0d31e88p-4, +0x8.2978efe51dafp-4, -0x8.a9d088b81da6p-7, +0xe.aeb56c029358p-11, + +0x8.20492639873ap-13, -0x8.20d4e21aab538p-12, +0x8.3376b2d53b4a8p-12, -0xf.83d3d1c17343p-13, + +0xe.8b584f0dd5ac8p-13, -0xd.9dd740ceaacf8p-13, +0xc.bedc85e7a621p-13, -0xb.ee0f472bf8968p-13, + +0xb.2a9baed1fe6cp-13, -0xa.73a9d1670f4ep-13, +0x9.c86d29d297798p-13, -0x9.2825f4d894088p-13, + +0x8.9220a956d651p-13, -0x8.05b539fdd79e8p-13, +0xf.048cb5194cfa8p-14, -0xe.0e819fa128938p-14, + +0xd.2835957d684cp-14, -0xc.50a69c2a8dc18p-14, +0xb.86e33bbaf3cbp-14, -0xa.ca097058af2cp-14, + +0xa.1945ad1703dcp-14, -0x9.73d1eef7d8b68p-14, +0x8.d8f4df1bb3efp-14, -0x8.48010323c6f7p-14, + +0xf.80a7f5baeeb2p-15, -0xe.82ab94bb68a8p-15, +0xd.94f05f80af008p-15, -0xc.b66c0799b21a8p-15, + }; + + const univector<T, size> ir = biquad(bq, unitimpulse<T>()); + + CHECK(absmaxof(choose_array<T>(test_vector_f32, test_vector_f64) - ir) == 0); + }); +} + +TEST(resampler_test) +{ + const int in_sr = 44100; + const int out_sr = 48000; + const int freq = 100; + auto resampler = sample_rate_converter<fbase>(resample_quality::draft, out_sr, in_sr); + double delay = 
resampler.get_fractional_delay(); + univector<fbase> out(out_sr / 10); + univector<fbase> in = truncate(sin(c_pi<fbase> * phasor<fbase>(freq, in_sr, 0)), in_sr / 10); + univector<fbase> ref = truncate( + sin(c_pi<fbase> * phasor<fbase>(freq, out_sr, -delay * (static_cast<double>(freq) / out_sr))), + out_sr / 10); + resampler.process(out, in); + + CHECK(rms(slice(out - ref, static_cast<size_t>(ceil(delay * 2)))) < 0.005f); +} +} // namespace CMT_ARCH_NAME + #ifndef KFR_NO_MAIN int main() { diff --git a/tests/ebu_test.cpp b/tests/ebu_test.cpp @@ -1,122 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -#include <kfr/testo/testo.hpp> - -#include <kfr/base.hpp> -#include <kfr/dsp.hpp> -#include <kfr/io.hpp> - -using namespace kfr; - -int main(int argc, char** argv) -{ - if (argc < 3) - { - println("Usage: ebu_test INPUT_IN_F32_RAW_FORMAT CHANNEL_NUMBER"); - return 1; - } - - // Prepare - FILE* f = fopen(argv[1], "rb"); - const int channel_number = atoi(argv[2]); - if (channel_number < 1 || channel_number > 6) - { - println("Incorrect number of channels"); - return 1; - } - fseek(f, 0, SEEK_END); - uintmax_t size = ftell(f); - fseek(f, 0, SEEK_SET); - if (size % (sizeof(float) * channel_number)) - { - println("Incorrect file size"); - return 1; - } - - // Read file - const size_t length = size / (sizeof(float) * channel_number); - univector<float> interleaved(size / sizeof(float)); - size_t read_len = fread(interleaved.data(), 1, size, f); - if (read_len != size) - { - println("Can't read file"); - return 1; - } - - // Deinterleave - univector<univector<float>> data(channel_number, univector<float>(length)); - for (size_t ch = 0; ch < channel_number; ++ch) - { - for (size_t i = 0; i < length; ++i) - { - data[ch][i] = interleaved[i * channel_number + ch]; - } - } - - std::vector<Speaker> speakers; - switch (channel_number) - { - case 1: - speakers = { Speaker::Mono }; - break; - case 2: - speakers = { 
Speaker::Left, Speaker::Right }; - break; - case 3: - speakers = { Speaker::Left, Speaker::Right, Speaker::Center }; - break; - case 4: - speakers = { Speaker::Left, Speaker::Right, Speaker::LeftSurround, Speaker::RightSurround }; - break; - case 5: - speakers = { Speaker::Left, Speaker::Right, Speaker::Center, Speaker::LeftSurround, - Speaker::RightSurround }; - break; - case 6: - speakers = { Speaker::Left, Speaker::Right, Speaker::Center, - Speaker::LeftSurround, Speaker::RightSurround, Speaker::Lfe }; - break; - } - - ebu_r128<float> loudness(48000, speakers); - - float M, S, I, RL, RH; - float maxM = -HUGE_VALF, maxS = -HUGE_VALF; - for (size_t i = 0; i < length / loudness.packet_size(); i++) - { - std::vector<univector_ref<float>> channels; - for (size_t ch = 0; ch < channel_number; ++ch) - { - channels.push_back(data[ch].slice(i * loudness.packet_size(), loudness.packet_size())); - } - loudness.process_packet(channels); - loudness.get_values(M, S, I, RL, RH); - maxM = std::max(maxM, M); - maxS = std::max(maxS, S); - } - - { - // For file-based measurements, the signal should be followed by at least 1.5 s of silence - std::vector<univector_dyn<float>> channels(channel_number, - univector_dyn<float>(loudness.packet_size())); - for (size_t i = 0; i < 15; ++i) - loudness.process_packet(channels); - float dummyM, dummyS, dummyI; - loudness.get_values(dummyM, dummyS, dummyI, RL, RH); - } - - println(argv[1]); - println("M = ", M); - println("S = ", S); - println("I = ", I); - println("LRA = ", RH - RL); - println("maxM = ", maxM); - println("maxS = ", maxS); - println(); - - return 0; -} diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp @@ -1,5 +0,0 @@ -#include <kfr/all.hpp> - -using namespace kfr; - -int main() {} diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp @@ -13,6 +13,9 @@ using namespace kfr; +namespace CMT_ARCH_NAME +{ + TEST(pack) { const univector<float, 21> v1 = 1 + counter(); @@ -59,6 +62,17 @@ TEST(test_arg_access) 
CHECK_EXPRESSION(e1, 10, [](size_t i) { return (i == 0 ? 100 : i) + 1; }); } +TEST(to_pointer) +{ + auto e1 = to_pointer(counter<float>()); + + CHECK_EXPRESSION(e1, infinite_size, [](size_t i) { return static_cast<float>(i); }); + + auto e2 = to_pointer(gen_linear(0.f, 1.f)); + + CHECK_EXPRESSION(e2, infinite_size, [](size_t i) { return static_cast<float>(i); }); +} + TEST(test_arg_replace) { univector<float, 10> v1 = counter(); @@ -88,11 +102,11 @@ TEST(placeholders_pointer) TEST(univector_assignment) { univector<int> x = truncate(counter(), 10); - CHECK(x.size() == 10); + CHECK(x.size() == 10u); univector<int> y; y = truncate(counter(), 10); - CHECK(y.size() == 10); + CHECK(y.size() == 10u); } TEST(size_calc) @@ -102,9 +116,9 @@ TEST(size_calc) auto b = slice(counter(), 100); CHECK(b.size() == infinite_size); auto c = slice(counter(), 100, 1000); - CHECK(c.size() == 1000); + CHECK(c.size() == 1000u); auto d = slice(c, 100); - CHECK(d.size() == 900); + CHECK(d.size() == 900u); } TEST(reverse) @@ -126,8 +140,8 @@ TEST(partition) { univector<double, 385> output = zeros(); auto result = partition(output, counter(), 5, 1); - CHECK(result.count == 5); - CHECK(result.chunk_size == 80); + CHECK(result.count == 5u); + CHECK(result.chunk_size == 80u); result(0); CHECK(sum(output) >= fast_range_sum(80 - 1)); @@ -144,8 +158,8 @@ TEST(partition) { univector<double, 385> output = zeros(); auto result = partition(output, counter(), 5, 160); - CHECK(result.count == 3); - CHECK(result.chunk_size == 160); + CHECK(result.count == 3u); + CHECK(result.chunk_size == 160u); result(0); CHECK(sum(output) >= fast_range_sum(160 - 1)); @@ -155,6 +169,7 @@ TEST(partition) CHECK(sum(output) == fast_range_sum(385 - 1)); } } +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/generate_data.cpp b/tests/generate_data.cpp @@ -0,0 +1,114 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#define _USE_MATH_DEFINES + 
+#include "mpfr/mpfrplus.hpp" +#include <kfr/cometa.hpp> +#include <kfr/cometa/ctti.hpp> +#include <kfr/cometa/function.hpp> +#include <kfr/io/file.hpp> +#include <random> + +constexpr size_t points = 10000; +constexpr size_t points_2arg = 100; + +constexpr size_t fuzz_points = 10000; +constexpr size_t fuzz_points_2arg = 100; + +using namespace kfr; + +using testo::test_data_entry; + +template <typename T> +struct range_sampler +{ + double min; + double max; + T operator()(size_t i, size_t num) { return static_cast<T>(min + (max - min) * i / (points - 1)); } +}; + +template <typename T> +struct fuzz_sampler +{ + std::mt19937_64 rnd{ 12345 }; + T operator()(size_t i, size_t num) { return bitcast_anything<T>(static_cast<utype<T>>(rnd())); } +}; + +template <typename T, typename Sampler> +void generate_table(const std::shared_ptr<file_writer<test_data_entry<T, 1>>>& writer, + cometa::function<mpfr::number(const mpfr::number&)> func, Sampler&& sampler) +{ + for (size_t i = 0; i < points; i++) + { + test_data_entry<T, 1> entry; + entry.arguments[0] = sampler(i, points); + entry.result = static_cast<T>(func(entry.arguments[0])); + writer->write(entry); + } +} + +template <typename T, typename Sampler> +void generate_table(const std::shared_ptr<file_writer<test_data_entry<T, 2>>>& writer, + cometa::function<mpfr::number(const mpfr::number&, const mpfr::number&)> func, + Sampler&& sampler) +{ + for (size_t i = 0; i < points_2arg; i++) + { + for (size_t j = 0; j < points_2arg; j++) + { + test_data_entry<T, 2> entry; + entry.arguments[0] = sampler(i, points_2arg); + entry.arguments[1] = sampler(j, points_2arg); + entry.result = static_cast<T>(func(entry.arguments[0], entry.arguments[1])); + writer->write(entry); + } + } +} + +template <int args, typename Func> +void generate_test(cint_t<args>, const char* name, const Func& func, double min, double max) +{ + generate_table(open_file_for_writing<test_data_entry<float, args>>(as_string(name, "_float_narrow")), + func, 
range_sampler<float>{ min, max }); + generate_table(open_file_for_writing<test_data_entry<double, args>>(as_string(name, "_double_narrow")), + func, range_sampler<double>{ min, max }); + + generate_table(open_file_for_writing<test_data_entry<float, args>>(as_string(name, "_float_fuzz")), func, + fuzz_sampler<float>{}); + generate_table(open_file_for_writing<test_data_entry<double, args>>(as_string(name, "_double_fuzz")), + func, fuzz_sampler<double>{}); +} + +int main() +{ + using num = mpfr::number; + mpfr::scoped_precision prec(512); + generate_test(cint<1>, "sin", [](const num& x) { return mpfr::sin(x); }, 0, M_PI * 2); + generate_test(cint<1>, "cos", [](const num& x) { return mpfr::cos(x); }, 0, M_PI * 2); + generate_test(cint<1>, "tan", [](const num& x) { return mpfr::tan(x); }, 0, M_PI); + + generate_test(cint<1>, "asin", [](const num& x) { return mpfr::asin(x); }, 0, 1); + generate_test(cint<1>, "acos", [](const num& x) { return mpfr::acos(x); }, 0, 1); + generate_test(cint<1>, "atan", [](const num& x) { return mpfr::atan(x); }, 0, 1); + generate_test(cint<2>, "atan2", [](const num& x, const num& y) { return mpfr::atan2(x, y); }, 0, 10); + + generate_test(cint<1>, "sinh", [](const num& x) { return mpfr::sinh(x); }, 0, 10 * 2); + generate_test(cint<1>, "cosh", [](const num& x) { return mpfr::cosh(x); }, 0, 10 * 2); + generate_test(cint<1>, "tanh", [](const num& x) { return mpfr::tanh(x); }, 0, 10 * 2); + generate_test(cint<1>, "coth", [](const num& x) { return mpfr::coth(x); }, 0, 10 * 2); + + generate_test(cint<1>, "gamma", [](const num& x) { return mpfr::gamma(x); }, 0, 10); + + generate_test(cint<1>, "log", [](const num& x) { return mpfr::log(x); }, 0, 100); + generate_test(cint<1>, "log2", [](const num& x) { return mpfr::log2(x); }, 0, 100); + generate_test(cint<1>, "log10", [](const num& x) { return mpfr::log10(x); }, 0, 100); + + generate_test(cint<1>, "exp", [](const num& x) { return mpfr::exp(x); }, -10, 10); + generate_test(cint<1>, "exp2", [](const 
num& x) { return mpfr::exp2(x); }, -10, 10); + generate_test(cint<1>, "exp10", [](const num& x) { return mpfr::exp10(x); }, -10, 10); + + generate_test(cint<1>, "cbrt", [](const num& x) { return mpfr::cbrt(x); }, 0, 1000); +} diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp @@ -7,44 +7,12 @@ #include <kfr/testo/testo.hpp> #include <kfr/base.hpp> -#include <kfr/dsp.hpp> #include <kfr/io.hpp> using namespace kfr; -constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64, i8x3, // - i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x3, // - i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 // -#ifdef KFR_NATIVE_I64 - , - i64x1, i64x2, i64x4, i64x8, i64x16, i64x3 // -#endif - > - signed_types{}; - -constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64, u8x3, // - u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x3, // - u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 // -#ifdef KFR_NATIVE_I64 - , - u64x1, u64x2, u64x4, u64x8, u64x16, u64x3 // -#endif - > - unsigned_types{}; - -constexpr ctypes_t<f32x1, f32x2, f32x4, f32x8, f32x16, f32x3 // -#ifdef KFR_NATIVE_F64 - , - f64x1, f64x2, f64x4, f64x8, f64x16, f64x3 // -#endif - > - float_types{}; - -template <typename T> -inline T ref_abs(T x) +namespace CMT_ARCH_NAME { - return x >= T(0) ? 
x : -x; -} template <typename T> bool builtin_add_overflow(T x, T y, T* r) @@ -127,43 +95,6 @@ inline T ref_satsub(T x, T y) return result; } -TEST(intrin_select) -{ - testo::matrix(named("type") = cconcat(signed_types, cconcat(unsigned_types, float_types)), [](auto type) { - using Tvec = type_of<decltype(type)>; - using T = subtype<Tvec>; - CHECK(kfr::select(make_mask<T>(false), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(2)); - CHECK(kfr::select(make_mask<T>(true), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(1)); - }); -} - -TEST(intrin_abs) -{ - testo::assert_is_same<decltype(kfr::abs(1)), int>(); - testo::assert_is_same<decltype(kfr::abs(1u)), unsigned int>(); - testo::assert_is_same<decltype(kfr::abs(make_vector(1))), i32x1>(); - testo::assert_is_same<decltype(kfr::abs(make_vector(1, 2))), i32x2>(); - CHECK(kfr::abs(9u) == 9u); - CHECK(kfr::abs(9) == 9); - CHECK(kfr::abs(-9) == 9); - CHECK(kfr::abs(-infinity) == infinity); - CHECK(kfr::abs(make_vector(9)) == make_vector(9)); - CHECK(kfr::abs(make_vector(-9)) == make_vector(9)); - - testo::matrix(named("type") = signed_types, named("value") = std::vector<int>{ -1, 0, +1 }, - [](auto type, int value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x)); - }); - testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ -1, 0, +1 }, - [](auto type, int value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x)); - }); -} - TEST(intrin_sqrt) { testo::assert_is_same<decltype(kfr::sqrt(9)), fbase>(); @@ -175,141 +106,45 @@ TEST(intrin_sqrt) CHECK(kfr::sqrt(-9) == fbase(qnan)); CHECK(kfr::sqrt(make_vector(9)) == make_vector<fbase>(3.0)); CHECK(kfr::sqrt(make_vector(-9)) == make_vector<fbase>(qnan)); - testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ 0, 2, 65536 }, + testo::matrix(named("type") = 
float_vector_types<vec>, named("value") = std::vector<int>{ 0, 2, 65536 }, [](auto type, int value) { using T = type_of<decltype(type)>; const T x(value); - CHECK(kfr::sqrt(x) == apply([](auto x) { return std::sqrt(x); }, x)); - }); -} - -TEST(intrin_round) -{ - testo::assert_is_same<decltype(kfr::floor(100)), int>(); - testo::assert_is_same<decltype(kfr::ceil(100)), int>(); - testo::assert_is_same<decltype(kfr::round(100)), int>(); - testo::assert_is_same<decltype(kfr::trunc(100)), int>(); - testo::assert_is_same<decltype(kfr::fract(100)), int>(); - - testo::assert_is_same<decltype(kfr::ifloor(100.f)), int>(); - testo::assert_is_same<decltype(kfr::iceil(100.f)), int>(); - testo::assert_is_same<decltype(kfr::iround(100.f)), int>(); - testo::assert_is_same<decltype(kfr::itrunc(100.f)), int>(); - CHECK(kfr::floor(100) == 100); - CHECK(kfr::ceil(100) == 100); - CHECK(kfr::round(100) == 100); - CHECK(kfr::trunc(100) == 100); - CHECK(kfr::fract(100) == 0); - - testo::matrix(named("type") = float_types, - named("value") = std::vector<fbase>{ -1.51, -1.49, 0.0, +1.49, +1.51 }, - [](auto type, fbase value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(kfr::floor(x) == apply([](auto x) { return std::floor(x); }, x)); - CHECK(kfr::ceil(x) == apply([](auto x) { return std::ceil(x); }, x)); - CHECK(kfr::round(x) == apply([](auto x) { return std::round(x); }, x)); - CHECK(kfr::trunc(x) == apply([](auto x) { return std::trunc(x); }, x)); - CHECK(kfr::fract(x) == apply([](auto x) { return x - std::floor(x); }, x)); - }); -} - -TEST(intrin_min_max) -{ - testo::assert_is_same<decltype(min(1, 2)), int>(); - testo::assert_is_same<decltype(min(1, 2u)), unsigned int>(); - testo::assert_is_same<decltype(min(1, 2)), int>(); - testo::assert_is_same<decltype(min(pack(1), 2u)), u32x1>(); - testo::assert_is_same<decltype(min(2u, pack(1))), u32x1>(); - testo::assert_is_same<decltype(min(pack(1), pack(2u))), u32x1>(); - testo::assert_is_same<decltype(min(pack(1, 2, 3), 
pack(1.0, 2.0, 3.0))), f64x3>(); - testo::assert_is_same<decltype(min(pack(1.0, 2.0, 3.0), pack(1, 2, 3))), f64x3>(); - - CHECK(min(1, 2) == 1); - CHECK(min(1, 2u) == 1u); - CHECK(min(pack(1), 2) == pack(1)); - CHECK(min(pack(1, 2, 3), 2) == pack(1, 2, 2)); - CHECK(min(pack(1., 2., 3.), 2) == pack(1., 2., 2.)); - - testo::matrix(named("type") = float_types, - named("value") = std::vector<std::pair<fbase, fbase>>{ { -100, +100 }, { infinity, 0.0 } }, - [](auto type, std::pair<fbase, fbase> value) { - using T = type_of<decltype(type)>; - const T x(value.first); - const T y(value.second); - CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y)); - CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y)); - CHECK(kfr::absmin(x, y) == - apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y)); - CHECK(kfr::absmax(x, y) == - apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y)); - }); - testo::matrix(named("type") = signed_types, - named("value") = std::vector<std::pair<int, int>>{ { -100, +100 } }, - [](auto type, std::pair<int, int> value) { - using T = type_of<decltype(type)>; - const T x(value.first); - const T y(value.second); - CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y)); - CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y)); - CHECK(kfr::absmin(x, y) == - apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y)); - CHECK(kfr::absmax(x, y) == - apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y)); - }); - testo::matrix(named("type") = unsigned_types, - named("value") = std::vector<std::pair<unsigned, unsigned>>{ { 0, +200 } }, - [](auto type, std::pair<unsigned, unsigned> value) { - using T = type_of<decltype(type)>; - const T x(value.first); - const T y(value.second); - CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, 
y)); - CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y)); - CHECK(kfr::absmin(x, y) == - apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y)); - CHECK(kfr::absmax(x, y) == - apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y)); + CHECK(kfr::sqrt(x) == apply([](auto x) -> decltype(x) { return std::sqrt(x); }, x)); }); } TEST(intrin_satadd_satsub) { - testo::matrix(named("type") = signed_types, [](auto type) { - using T = type_of<decltype(type)>; - using Tsub = subtype<T>; - const T min = std::numeric_limits<Tsub>::min(); - const T max = std::numeric_limits<Tsub>::max(); - CHECK(kfr::satadd(min, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, min)); - CHECK(kfr::satadd(max, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, max)); - CHECK(kfr::satadd(min, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, max)); - CHECK(kfr::satadd(max, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, min)); - - CHECK(kfr::satsub(min, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, min)); - CHECK(kfr::satsub(max, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, max)); - CHECK(kfr::satsub(min, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, max)); - CHECK(kfr::satsub(max, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, min)); - }); - - testo::matrix(named("type") = unsigned_types, [](auto type) { - using T = type_of<decltype(type)>; - using Tsub = subtype<T>; - const T& min = std::numeric_limits<Tsub>::min(); - const T& max = std::numeric_limits<Tsub>::max(); - CHECK(kfr::satadd(min, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, min)); - CHECK(kfr::satadd(max, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, max)); - CHECK(kfr::satadd(min, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, max)); 
- CHECK(kfr::satadd(max, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, min)); - - CHECK(kfr::satsub(min, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, min)); - CHECK(kfr::satsub(max, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, max)); - CHECK(kfr::satsub(min, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, max)); - CHECK(kfr::satsub(max, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, min)); - }); + testo::matrix(named("type") = cconcat(signed_vector_types<vec>, unsigned_vector_types<vec>), + [](auto type) { + using T = type_of<decltype(type)>; + using Tsub = subtype<T>; + const T min = std::numeric_limits<Tsub>::min(); + const T max = std::numeric_limits<Tsub>::max(); + CHECK(kfr::satadd(min, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, min, min)); + CHECK(kfr::satadd(max, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, max, max)); + CHECK(kfr::satadd(min, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, min, max)); + CHECK(kfr::satadd(max, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, max, min)); + + CHECK(kfr::satsub(min, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, min, min)); + CHECK(kfr::satsub(max, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, max, max)); + CHECK(kfr::satsub(min, max) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, min, max)); + CHECK(kfr::satsub(max, min) == + apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, max, min)); + }); } TEST(intrin_any_all) { - testo::matrix(named("type") = unsigned_types, [](auto type) { + testo::matrix(named("type") = unsigned_vector_types<vec>, [](auto type) { using T = type_of<decltype(type)>; constexpr size_t width = widthof<T>(); using Tsub = subtype<T>; @@ 
-328,74 +163,7 @@ TEST(intrin_any_all) }); } -TEST(intrin_math) -{ - testo::assert_is_same<decltype(pack(11) * pack(0.5)), f64x1>(); - testo::assert_is_same<decltype(pack(11) * 0.5), f64x1>(); - testo::assert_is_same<decltype(kfr::sin(2)), fbase>(); - testo::assert_is_same<decltype(kfr::sin(pack(2))), vec<fbase, 1>>(); - testo::assert_is_same<decltype(kfr::sindeg(2)), fbase>(); - testo::assert_is_same<decltype(kfr::sindeg(pack(2))), vec<fbase, 1>>(); - - CHECK(pack(11) * pack(0.5) == 5.5); - CHECK(pack(11) * 0.5 == 5.5); - CHECK(kfr::sin(2) == fbase(0.90929742682568169539601986591174)); - CHECK(kfr::sin(pack(2)) == pack(fbase(0.90929742682568169539601986591174))); - CHECK(kfr::sindeg(2) == fbase(0.03489949670250097164599518162533)); - CHECK(kfr::sindeg(pack(2)) == pack(fbase(0.03489949670250097164599518162533))); - CHECK(kfr::cos(2) == fbase(-0.41614683654714238699756822950076)); - CHECK(kfr::cos(pack(2)) == pack(fbase(-0.41614683654714238699756822950076))); - CHECK(kfr::cosdeg(2) == fbase(0.99939082701909573000624344004393)); - CHECK(kfr::cosdeg(pack(2)) == pack(fbase(0.99939082701909573000624344004393))); - - CHECK(kfr::log(2) == fbase(0.6931471805599453)); - CHECK(kfr::log(pack(2)) == pack(fbase(0.6931471805599453))); - CHECK(kfr::log2(2) == fbase(1.0)); - CHECK(kfr::log2(pack(2)) == pack(fbase(1.0))); - CHECK(kfr::log10(2) == fbase(0.30102999566398119521373889472449)); - CHECK(kfr::log10(pack(2)) == pack(fbase(0.30102999566398119521373889472449))); - - CHECK(kfr::exp(2) == fbase(7.3890560989306502)); - CHECK(kfr::exp(pack(2)) == pack(fbase(7.3890560989306502))); - CHECK(kfr::exp2(2) == fbase(4.0)); - CHECK(kfr::exp2(pack(2)) == pack(fbase(4.0))); - - CHECK(kfr::logn(2, 10) == fbase(0.30102999566398119521373889472449)); - CHECK(kfr::logn(pack(2), pack(10)) == pack(fbase(0.30102999566398119521373889472449))); - - CHECK(kfr::pow(2, fbase(0.9)) == fbase(1.8660659830736148319626865322999)); - CHECK(kfr::pow(pack(2), pack(fbase(0.9))) == 
pack(fbase(1.8660659830736148319626865322999))); - - CHECK(kfr::root(fbase(1.5), 2) == fbase(1.2247448713915890490986420373529)); - CHECK(kfr::root(pack(fbase(1.5)), pack(2)) == pack(fbase(1.2247448713915890490986420373529))); - - testo::epsilon<float>() *= 10.0; - testo::epsilon<double>() *= 10.0; - - CHECK(kfr::sinh(2) == fbase(3.6268604078470187676682139828013)); - CHECK(kfr::sinh(pack(2)) == pack(fbase(3.6268604078470187676682139828013))); - CHECK(kfr::cosh(2) == fbase(3.7621956910836314595622134777737)); - CHECK(kfr::cosh(pack(2)) == pack(fbase(3.7621956910836314595622134777737))); - - CHECK(kfr::tanh(2) == fbase(0.96402758007581688394641372410092)); - CHECK(kfr::tanh(pack(2)) == pack(fbase(0.96402758007581688394641372410092))); - CHECK(kfr::coth(2) == fbase(1.0373147207275480958778097647678)); - CHECK(kfr::coth(pack(2)) == pack(fbase(1.0373147207275480958778097647678))); - - testo::epsilon<float>() *= 10.0; - testo::epsilon<double>() *= 10.0; - - CHECK(kfr::tan(2) == fbase(-2.1850398632615189916433061023137)); - CHECK(kfr::tan(pack(2)) == pack(fbase(-2.1850398632615189916433061023137))); - CHECK(kfr::tandeg(2) == fbase(0.03492076949174773050040262577373)); - CHECK(kfr::tandeg(pack(2)) == pack(fbase(0.03492076949174773050040262577373))); - - testo::epsilon<float>() *= 10.0; - testo::epsilon<double>() *= 10.0; - - CHECK(kfr::note_to_hertz(60) == fbase(261.6255653005986346778499935233)); - CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233))); -} +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/io_test.cpp b/tests/io_test.cpp @@ -8,11 +8,13 @@ #include <kfr/base.hpp> #include <kfr/cometa/function.hpp> -#include <kfr/dsp.hpp> #include <kfr/io.hpp> using namespace kfr; +namespace CMT_ARCH_NAME +{ + #if KFR_ENABLE_WAV TEST(write_wav_file) { @@ -22,17 +24,17 @@ TEST(write_wav_file) data = sin(counter() * 0.01f); size_t wr = writer.write(data.data(), data.size()); CHECK(wr == data.size()); - 
CHECK(writer.format().length == data.size() / 2); + CHECK(umax(writer.format().length) == data.size() / 2); } TEST(read_wav_file) { audio_reader_wav<float> reader(open_file_for_reading(KFR_FILEPATH("temp_audio_file.wav"))); - CHECK(reader.format().channels == 2); + CHECK(reader.format().channels == 2u); CHECK(reader.format().type == audio_sample_type::i16); CHECK(reader.format().samplerate == 44100); univector<float> data(44100 * 2); - CHECK(reader.format().length == data.size() / 2); + CHECK(umax(reader.format().length) == data.size() / 2); size_t rd = reader.read(data.data(), data.size()); CHECK(rd == data.size()); CHECK(absmaxof(data - render(sin(counter() * 0.01f), data.size())) < 0.0001f); @@ -40,10 +42,10 @@ TEST(read_wav_file) #endif #if KFR_ENABLE_FLAC -TEST(read_flac_file) +DTEST(read_flac_file) { audio_reader_flac<float> reader(open_file_for_reading(KFR_FILEPATH("../../tests/test-audio/sine.flac"))); - CHECK(reader.format().channels == 2); + CHECK(reader.format().channels == 2u); CHECK(reader.format().type == audio_sample_type::i32); CHECK(reader.format().samplerate == 44100); univector<float> data(44100 * 2); @@ -53,6 +55,7 @@ TEST(read_flac_file) CHECK(absmaxof(data - render(sin(counter() * 0.01f), data.size())) < 0.0001f); } #endif +} // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN int main() diff --git a/tests/mpfr/mpfrplus.hpp b/tests/mpfr/mpfrplus.hpp @@ -18,6 +18,7 @@ MPFR_DIAG_PRAGMA(ignored "-Wsign-conversion") MPFR_DIAG_PRAGMA(pop) #include <cmath> #include <limits> +#include <string> #include <type_traits> namespace mpfr @@ -47,17 +48,14 @@ constexpr with_precision_t with_precision{}; namespace internal { -#ifndef MPFR_THREAD_LOCAL -#define MPFR_THREAD_LOCAL thread_local -#endif -static mpfr_prec_t& precision() +inline mpfr_prec_t& precision() { - static MPFR_THREAD_LOCAL mpfr_prec_t prec = mpfr_get_default_prec(); + static mpfr_prec_t prec = mpfr_get_default_prec(); return prec; } -static mpfr_rnd_t& rounding_mode() +inline mpfr_rnd_t& 
rounding_mode() { - static MPFR_THREAD_LOCAL mpfr_rnd_t rnd = mpfr_get_default_rounding_mode(); + static mpfr_rnd_t rnd = mpfr_get_default_rounding_mode(); return rnd; } } // namespace internal @@ -241,7 +239,7 @@ public: MPFR_CXX_CTOR_T(mpfr_set_ui, unsigned int) MPFR_CXX_CTOR_T(mpfr_set_si, long int) MPFR_CXX_CTOR_T(mpfr_set_ui, unsigned long int) -#if __INTMAX_MAX__ != __LONG_MAX__ +#ifdef _MPFR_H_HAVE_INTMAX_T MPFR_CXX_CTOR_T(mpfr_set_sj, intmax_t) MPFR_CXX_CTOR_T(mpfr_set_uj, uintmax_t) #endif @@ -253,7 +251,7 @@ public: MPFR_CXX_ASGN_T(mpfr_set_ui, unsigned int) MPFR_CXX_ASGN_T(mpfr_set_si, long int) MPFR_CXX_ASGN_T(mpfr_set_ui, unsigned long int) -#if __INTMAX_MAX__ != __LONG_MAX__ +#ifdef _MPFR_H_HAVE_INTMAX_T MPFR_CXX_ASGN_T(mpfr_set_sj, intmax_t) MPFR_CXX_ASGN_T(mpfr_set_uj, uintmax_t) #endif @@ -300,6 +298,15 @@ public: { return mpfr_get_ld(val, internal::rounding_mode()); } + + std::string to_string() const + { + char* str; + mpfr_asprintf(&str, "%.*Rg", prec(), val); + std::string result = str; + mpfr_free_str(str); + return result; + } }; #ifdef MPFR_USE_UDL diff --git a/tests/multiarch.cpp b/tests/multiarch.cpp @@ -7,7 +7,6 @@ #include <kfr/testo/testo.hpp> #include <kfr/base.hpp> -#include <kfr/cpuid.hpp> #include <kfr/dsp.hpp> #include <kfr/io.hpp> diff --git a/tests/numeric_tests.hpp b/tests/numeric_tests.hpp @@ -0,0 +1,123 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/io.hpp> +#include <kfr/testo/testo.hpp> + +namespace kfr +{ + +using testo::test_data_entry; + +inline namespace CMT_ARCH_NAME +{ + +using vector_types = + ctypes_t<f32, f32x1, f32x2, f32x4, f32x8, f32x16, f32x32, f64, f64x1, f64x2, f64x4, f64x8, f64x16>; + +template <typename T> +uint64_t ulps(T x, T y) +{ + if (std::abs(x) < std::numeric_limits<T>::min() && std::abs(y) < std::numeric_limits<T>::min()) + return 0; + if (std::isnan(x) && std::isnan(y)) + return 0; + if (std::isinf(x) && std::isinf(y)) + 
return (x < 0) == (y < 0) ? 0 : ULLONG_MAX; + if (x < 0 && y < 0) + return ulps<T>(-x, -y); + if ((x < 0) != (y < 0)) + return ulps<T>(std::abs(x), 0) + ulps<T>(std::abs(y), 0); + + utype<T> ix = cometa::bitcast_anything<utype<T>>(x); + utype<T> iy = cometa::bitcast_anything<utype<T>>(y); + if (std::abs(x) < std::numeric_limits<T>::min() && y > std::numeric_limits<T>::min()) + return 1 + ulps<T>(std::numeric_limits<T>::min(), y); + if (std::abs(x) > std::numeric_limits<T>::min() && y < std::numeric_limits<T>::min()) + return 1 + ulps<T>(x, std::numeric_limits<T>::min()); + return ix > iy ? ix - iy : iy - ix; +} + +template <typename T, size_t N> +uint64_t ulps(vec<T, N> x, vec<T, N> y) +{ + uint64_t u = 0; + for (size_t i = 0; i < N; i++) + { + u = std::max(u, ulps(x[i], y[i])); + } + return u; +} + +inline const char* tname(ctype_t<f32>) { return "float"; } +inline const char* tname(ctype_t<f64>) { return "double"; } + +#define CHECK_DIFF(x_arg, y_arg, threshold) \ + do \ + { \ + ++checks_count; \ + const auto x_arg_value = x_arg; \ + const auto y_arg_value = y_arg; \ + const auto arg_diff = ulps(x_arg_value, y_arg_value); \ + error_sum += arg_diff; \ + error_peak = std::max(error_peak, arg_diff); \ + ::testo::active_test()->check( \ + arg_diff <= threshold, \ + ::cometa::as_string(x_arg_value, " ~= ", y_arg_value, " (", arg_diff, " <= ", threshold, ")"), \ + #x_arg " ~= " #y_arg); \ + } while (0) + +#define KFR_AUTO_TEST_1(fn, datafile, maxulps, avgulps) \ + TEST(fn##_##datafile) \ + { \ + testo::matrix(named("type") = vector_types(), [&](auto type) { \ + using T = type_of<decltype(type)>; \ + using Tsub = subtype<T>; \ + double error_sum = 0.0; \ + uint64_t error_peak = 0; \ + uint64_t checks_count = 0; \ + std::shared_ptr<file_reader<test_data_entry<Tsub, 1>>> reader = \ + open_file_for_reading<test_data_entry<Tsub, 1>>( \ + std::string(KFR_SRC_DIR "/tests/data/" #fn "_") + tname(ctype<Tsub>) + "_" #datafile); \ + test_data_entry<Tsub, 1> entry; \ + while 
(reader->read(entry)) \ + { \ + testo::scope s(as_string(entry.arguments[0])); \ + CHECK_DIFF(kfr::fn(entry.arguments[0]), entry.result, maxulps); \ + } \ + CHECK(checks_count > 0u); \ + CHECK(error_sum / checks_count <= avgulps); \ + println("measured accuracy: ", tname(ctype<Tsub>), " ", error_sum / checks_count, "(peak ", \ + error_peak, ")"); \ + }); \ + } + +#define KFR_AUTO_TEST_2(fn, datafile, maxulps, avgulps) \ + TEST(fn##_##datafile) \ + { \ + testo::matrix(named("type") = vector_types(), [&](auto type) { \ + using T = type_of<decltype(type)>; \ + using Tsub = subtype<T>; \ + double error_sum = 0.0; \ + uint64_t error_peak = 0; \ + uint64_t checks_count = 0; \ + std::shared_ptr<file_reader<test_data_entry<Tsub, 2>>> reader = \ + open_file_for_reading<test_data_entry<Tsub, 2>>( \ + std::string(KFR_SRC_DIR "/tests/data/" #fn "_") + tname(ctype<Tsub>) + "_" #datafile); \ + test_data_entry<Tsub, 2> entry; \ + while (reader->read(entry)) \ + { \ + testo::scope s(as_string(entry.arguments[0], entry.arguments[1])); \ + CHECK_DIFF(kfr::fn(entry.arguments[0], entry.arguments[1]), entry.result, maxulps); \ + } \ + CHECK(checks_count > 0u); \ + CHECK(error_sum / checks_count <= avgulps); \ + println("measured accuracy: ", tname(ctype<Tsub>), " ", error_sum / checks_count, "(peak ", \ + error_peak, ")"); \ + }); \ + } +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/resampler_test.cpp b/tests/resampler_test.cpp @@ -1,37 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -#include <kfr/dsp.hpp> -#include <kfr/io.hpp> -#include <kfr/testo/testo.hpp> - -using namespace kfr; - -TEST(resampler_test) -{ - const int in_sr = 44100; - const int out_sr = 48000; - const int freq = 100; - auto resampler = sample_rate_converter<fbase>(resample_quality::draft, out_sr, in_sr); - double delay = resampler.get_fractional_delay(); - univector<fbase> out(out_sr / 10); - univector<fbase> in = 
truncate(sin(c_pi<fbase> * phasor<fbase>(freq, in_sr, 0)), in_sr / 10); - univector<fbase> ref = truncate( - sin(c_pi<fbase> * phasor<fbase>(freq, out_sr, -delay * (static_cast<double>(freq) / out_sr))), - out_sr / 10); - resampler.process(out, in); - - CHECK(rms(slice(out - ref, ceil(delay * 2))) < 0.005f); -} - -#ifndef KFR_NO_MAIN -int main() -{ - println(library_version()); - - return testo::run_all("", true); -} -#endif diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp @@ -1,172 +0,0 @@ -/** - * KFR (http://kfrlib.com) - * Copyright (C) 2016 D Levin - * See LICENSE.txt for details - */ - -#include <kfr/testo/testo.hpp> - -#include <kfr/base.hpp> -#include <kfr/io.hpp> - -#define MPFR_THREAD_LOCAL -#include "mpfr/mpfrplus.hpp" - -using namespace kfr; - -using vector_types = ctypes_t<f32, f64, f32x2, f32x8, f32x16, f64x2, f64x4, f64x8>; - -template <typename T> -double ulps(T test, const mpfr::number& ref) -{ - if (std::isnan(test) && ref.isnan()) - return 0; - if (std::isinf(test) && ref.isinfinity()) - return (test < 0) == (ref < 0) ? 
0 : NAN; - return static_cast<double>(mpfr::abs(mpfr::number(test) - ref) / - mpfr::abs(mpfr::number(test) - std::nexttoward(test, HUGE_VALL))); -} - -template <typename T, size_t N> -double ulps(const vec<T, N>& test, const mpfr::number& ref) -{ - double u = 0; - for (size_t i = 0; i < N; ++i) - u = std::max(u, ulps(test[i], ref)); - return u; -} - -TEST(test_sin_cos) -{ - testo::matrix(named("type") = vector_types(), - named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0); - CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0); - }); - testo::matrix(named("type") = vector_types(), named("value") = make_range(-100.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0); - CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0); - }); -} - -TEST(test_tan) -{ - testo::matrix(named("type") = ctypes_t<f32>(), - named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.01), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 2.0); - }); - testo::matrix(named("type") = ctypes_t<f32>(), named("value") = make_range(-100.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 3.0); - }); -} - -#ifdef __clang__ -#define ARCFN_ULP 2.0 -#else -#define ARCFN_ULP 2.5 -#endif - -TEST(test_asin_acos_atan) -{ - testo::matrix(named("type") = vector_types(), named("value") = make_range(-1.0, 1.0, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::asin(x), mpfr::asin(subtype<T>(value))) < ARCFN_ULP); - CHECK(ulps(kfr::acos(x), 
mpfr::acos(subtype<T>(value))) < ARCFN_ULP); - CHECK(ulps(kfr::atan(x), mpfr::atan(subtype<T>(value))) < ARCFN_ULP); - }); -} - -TEST(test_atan2) -{ - testo::matrix(named("type") = vector_types(), named("value1") = make_range(-1.0, 1.0, 0.1), - named("value2") = make_range(-1.0, 1.0, 0.1), [](auto type, double value1, double value2) { - using T = type_of<decltype(type)>; - const T x(value1); - const T y(value2); - CHECK(ulps(kfr::atan2(x, y), mpfr::atan2(subtype<T>(value1), subtype<T>(value2))) < - ARCFN_ULP); - }); -} - -TEST(test_log) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::log(x), mpfr::log(x)) < 2.0); - }); -} - -TEST(test_log2) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::log2(x), mpfr::log2(x)) < 3.0); - }); -} - -TEST(test_log10) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::log10(x), mpfr::log10(x)) < 3.0); - }); -} - -TEST(test_exp) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::exp(x), mpfr::exp(x)) < 2.0); - }); -} - -TEST(test_exp2) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), - [](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::exp2(x), mpfr::exp2(x)) < 3.0); - }); -} - -TEST(test_exp10) -{ - testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), - 
[](auto type, double value) { - using T = type_of<decltype(type)>; - const T x(value); - CHECK(ulps(kfr::exp10(x), mpfr::exp10(x)) < 3.0); - }); -} - -#ifndef KFR_NO_MAIN -int main() -{ - println(library_version(), " running on ", cpu_runtime()); - mpfr::scoped_precision p(128); - return testo::run_all(""); -} -#endif diff --git a/tests/unit/base/conversion.cpp b/tests/unit/base/conversion.cpp @@ -0,0 +1,67 @@ +#include <kfr/base/conversion.hpp> + +#include <kfr/base/basic_expressions.hpp> + +#include <kfr/base/reduce.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +TEST(sample_conversion) +{ + CHECK(convert_sample<float>(static_cast<i8>(-127)) == -1.f); + CHECK(convert_sample<float>(static_cast<i8>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i8>(127)) == 1.f); + + CHECK(convert_sample<float>(static_cast<i16>(-32767)) == -1.f); + CHECK(convert_sample<float>(static_cast<i16>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i16>(32767)) == 1.f); + + CHECK(convert_sample<float>(static_cast<i24>(-8388607)) == -1.f); + CHECK(convert_sample<float>(static_cast<i24>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i24>(8388607)) == 1.f); + + CHECK(convert_sample<float>(static_cast<i32>(-2147483647)) == -1.f); + CHECK(convert_sample<float>(static_cast<i32>(0)) == 0.f); + CHECK(convert_sample<float>(static_cast<i32>(2147483647)) == 1.f); + + CHECK(convert_sample<i8>(-1.f) == -127); + CHECK(convert_sample<i8>(0.f) == 0); + CHECK(convert_sample<i8>(1.f) == 127); + + CHECK(convert_sample<i16>(-1.f) == -32767); + CHECK(convert_sample<i16>(0.f) == 0); + CHECK(convert_sample<i16>(1.f) == 32767); + + CHECK(convert_sample<i24>(-1.f) == -8388607); + CHECK(convert_sample<i24>(0.f) == 0); + CHECK(convert_sample<i24>(1.f) == 8388607); + + CHECK(convert_sample<i32>(-1.f) == -2147483647); + CHECK(convert_sample<i32>(0.f) == 0); + CHECK(convert_sample<i32>(1.f) == 2147483647); +} + +TEST(sample_interleave_deinterleave) +{ + const size_t size = 50; + 
univector2d<float> in; + in.push_back(truncate(counter() * 3.f + 0.f, size)); + in.push_back(truncate(counter() * 3.f + 1.f, size)); + in.push_back(truncate(counter() * 3.f + 2.f, size)); + univector<float> out(size * 3); + interleave(out.data(), std::array<const float*, 3>{ in[0].data(), in[1].data(), in[2].data() }.data(), 3, + size); + CHECK(maxof(out - render(counter() * 1.f, out.size())) == 0); + + deinterleave(std::array<float*, 3>{ in[0].data(), in[1].data(), in[2].data() }.data(), out.data(), 3, + size); + + CHECK(absmaxof(in[0] - render(counter() * 3.f + 0.f, size)) == 0); + CHECK(absmaxof(in[1] - render(counter() * 3.f + 1.f, size)) == 0); + CHECK(absmaxof(in[2] - render(counter() * 3.f + 2.f, size)) == 0); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/base/reduce.cpp b/tests/unit/base/reduce.cpp @@ -0,0 +1,41 @@ +#include <kfr/base/reduce.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +TEST(reduce) +{ + { + univector<float, 5> a({ 1, 2, 3, 4, 5 }); + CHECK(sum(a) == 15); + CHECK(mean(a) == 3); + CHECK(minof(a) == 1); + CHECK(maxof(a) == 5); + CHECK(sumsqr(a) == 55); + CHECK(rms(a) == 3.316624790355399849115f); + CHECK(product(a) == 120); + } + { + univector<double, 5> a({ 1, 2, 3, 4, 5 }); + CHECK(sum(a) == 15); + CHECK(mean(a) == 3); + CHECK(minof(a) == 1); + CHECK(maxof(a) == 5); + CHECK(sumsqr(a) == 55); + CHECK(rms(a) == 3.316624790355399849115); + CHECK(product(a) == 120); + } + { + univector<int, 5> a({ 1, 2, 3, 4, 5 }); + CHECK(sum(a) == 15); + CHECK(mean(a) == 3); + CHECK(minof(a) == 1); + CHECK(maxof(a) == 5); + CHECK(sumsqr(a) == 55); + CHECK(product(a) == 120); + } +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/abs.cpp b/tests/unit/math/abs.cpp @@ -0,0 +1,13 @@ +#include <kfr/math/abs.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(abs) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::abs(x); }, + [](auto x) -> decltype(x) { 
return x >= 0 ? x : -x; }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/asin_acos.cpp b/tests/unit/math/asin_acos.cpp @@ -0,0 +1,18 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/asin_acos.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(asin, narrow, 6, 1) +KFR_AUTO_TEST_1(acos, narrow, 800, 1) +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/tests/unit/math/atan.cpp b/tests/unit/math/atan.cpp @@ -0,0 +1,18 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include "../../numeric_tests.hpp" + +#include <kfr/math/atan.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(atan, narrow, 2, 1) +KFR_AUTO_TEST_2(atan2, narrow, 2, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/hyperbolic.cpp b/tests/unit/math/hyperbolic.cpp @@ -0,0 +1,21 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include "../../numeric_tests.hpp" + +#include <kfr/math/hyperbolic.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(sinh, narrow, 114, 2.5) +KFR_AUTO_TEST_1(cosh, narrow, 7, 2.5) +KFR_AUTO_TEST_1(tanh, narrow, 45, 1) +KFR_AUTO_TEST_1(coth, narrow, 85, 1) +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/tests/unit/math/log_exp.cpp b/tests/unit/math/log_exp.cpp @@ -0,0 +1,23 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/log_exp.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(gamma, narrow, 2200, 321) +KFR_AUTO_TEST_1(exp, narrow, 4, 2) +KFR_AUTO_TEST_1(exp2, narrow, 5, 2) +KFR_AUTO_TEST_1(exp10, narrow, 40, 10) +KFR_AUTO_TEST_1(log, narrow, 
2, 1) +KFR_AUTO_TEST_1(log2, narrow, 2, 1) +KFR_AUTO_TEST_1(log10, narrow, 3, 1) +KFR_AUTO_TEST_1(cbrt, narrow, 5, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/min_max.cpp b/tests/unit/math/min_max.cpp @@ -0,0 +1,39 @@ +#include <kfr/math/min_max.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(min) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::min(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x <= y ? x : y; }); +} + +TEST(max) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::max(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x >= y ? x : y; }); +} + +TEST(absmin) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::absmin(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + x = x >= 0 ? x : -x; + y = y >= 0 ? y : -y; + return x <= y ? x : y; + }); +} + +TEST(absmax) +{ + test_function2(test_catogories::all, [](auto x, auto y) { return kfr::absmax(x, y); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + x = x >= 0 ? x : -x; + y = y >= 0 ? y : -y; + return x >= y ? x : y; + }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/round.cpp b/tests/unit/math/round.cpp @@ -0,0 +1,53 @@ +#include <kfr/math/round.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(floor) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::floor(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? x + : static_cast<decltype(x)>(std::floor(x)); + }); +} + +TEST(ceil) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::ceil(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? 
x + : static_cast<decltype(x)>(std::ceil(x)); + }); +} + +TEST(trunc) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::trunc(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? x + : static_cast<decltype(x)>(std::trunc(x)); + }); +} + +TEST(round) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::round(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value ? x + : static_cast<decltype(x)>(std::round(x)); + }); +} + +TEST(fract) +{ + test_function1(test_catogories::all, [](auto x) { return kfr::fract(x); }, + [](auto x) -> decltype(x) { + return std::is_integral<decltype(x)>::value + ? 0 + : static_cast<decltype(x)>(x - std::floor(x)); + }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/select.cpp b/tests/unit/math/select.cpp @@ -0,0 +1,27 @@ +#include <kfr/math/select.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(select_true) +{ + test_function2(test_catogories::vectors, + [](auto x, auto y) { + mask<subtype<decltype(x)>, decltype(x)::scalar_size()> m(true); + return kfr::select(m, x, y); + }, + [](auto x, auto) { return x; }); +} + +TEST(select_false) +{ + test_function2(test_catogories::vectors, + [](auto x, auto y) { + mask<subtype<decltype(x)>, decltype(x)::scalar_size()> m(false); + return kfr::select(m, x, y); + }, + [](auto, auto y) { return y; }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/sin_cos.cpp b/tests/unit/math/sin_cos.cpp @@ -0,0 +1,17 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/sin_cos.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(sin, narrow, 7, 1) +KFR_AUTO_TEST_1(cos, narrow, 2, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/math/tan.cpp b/tests/unit/math/tan.cpp @@ -0,0 +1,16 @@ 
+/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ +#include "../../numeric_tests.hpp" + +#include <kfr/math/tan.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +KFR_AUTO_TEST_1(tan, narrow, 7, 1) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/complex.cpp b/tests/unit/simd/complex.cpp @@ -0,0 +1,33 @@ +#include <kfr/simd/complex.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(complex_convertible) +{ + static_assert(std::is_convertible<float, complex<float>>::value, ""); + static_assert(std::is_convertible<float, complex<double>>::value, ""); + static_assert(std::is_convertible<short, complex<double>>::value, ""); + + static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, ""); + static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, ""); + + static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, ""); + static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, ""); + + CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f }); + CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. }); + CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. }); + + CHECK(static_cast<vec<complex<float>, 2>>(complex<float>{ 1.f, 2.f }) == + vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); + + CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) == + vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }); + + CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) == + vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. 
} }); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/operators.cpp b/tests/unit/simd/operators.cpp @@ -0,0 +1,220 @@ +#include <kfr/simd/horizontal.hpp> +#include <kfr/simd/operators.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(neg) +{ + test_function1(test_catogories::vectors, [](auto x) -> decltype(x) { return -x; }, + [](auto x) -> decltype(x) { return -x; }); +} + +TEST(bnot) +{ + test_function1(test_catogories::vectors, [](auto x) -> decltype(x) { return ~x; }, + [](auto x) -> decltype(x) { + utype<decltype(x)> u = ~ubitcast(x); + return bitcast<decltype(x)>(u); + }); +} + +TEST(add) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x + y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x + y; }); +} + +TEST(sub) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x - y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x - y; }); +} + +TEST(mul) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x * y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x * y; }); +} + +template <typename T> +inline bool is_safe_division(T x, T y) +{ + return y != T(0) && !(std::is_signed<T>::value && x == std::numeric_limits<T>::min() && y == T(-1)); +} + +TEST(div) +{ + test_function2(test_catogories::vectors, + [](auto x, auto y) { + return is_safe_division<subtype<decltype(x)>>(x.front(), y.front()) ? x / y : 0; + }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return is_safe_division(x, y) ? 
x / y : 0; + }); +} + +TEST(bor) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x | y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + using T = common_type<decltype(x), decltype(y)>; + return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) | ubitcast(T(y)))); + }); +} + +TEST(bxor) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x ^ y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + using T = common_type<decltype(x), decltype(y)>; + return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) ^ ubitcast(T(y)))); + }); +} + +TEST(band) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return x & y; }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + using T = common_type<decltype(x), decltype(y)>; + return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) & ubitcast(T(y)))); + }); +} + +TEST(shl) +{ + testo::matrix( + named("type") = test_catogories::types(test_catogories::vectors), named("value1") = special_values(), + named("shift") = std::vector<unsigned>{ 1, 2, 7, 8, 9, 15, 16, 31, 32, 63, 64 }, + [&](auto type, special_value value, unsigned shift) { + using T = type_of<decltype(type)>; + if (shift < sizeof(subtype<T>)) + { + const T x(value); + CHECK(std::is_same<decltype(x << shift), T>::value); + CHECK((x << shift) == apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) << shift)); + }, + x)); + CHECK((x << broadcast<T::scalar_size()>(utype<subtype<T>>(shift))) == + apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) << shift)); + }, + x)); + } + }); +} + +TEST(shr) +{ + testo::matrix( + named("type") = test_catogories::types(test_catogories::vectors), named("value1") = special_values(), + named("shift") = std::vector<unsigned>{ 1, 2, 7, 8, 9, 15, 16, 31, 32, 63, 64 }, + [&](auto type, special_value value, 
unsigned shift) { + using T = type_of<decltype(type)>; + if (shift < sizeof(subtype<T>)) + { + const T x(value); + CHECK(std::is_same<decltype(x << shift), T>::value); + CHECK((x >> shift) == apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) >> shift)); + }, + x)); + CHECK((x >> broadcast<T::scalar_size()>(utype<subtype<T>>(shift))) == + apply( + [=](auto x) -> decltype(x) { + return bitcast<decltype(x)>( + static_cast<uitype<decltype(x)>>(uibitcast(x) >> shift)); + }, + x)); + } + }); +} + +TEST(eq) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x == y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x == y); + }); +} + +TEST(ne) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x != y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x != y); + }); +} + +TEST(ge) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x >= y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x >= y); + }); +} + +TEST(le) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x <= y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x <= y); + }); +} + +TEST(gt) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x > y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x > y); + }); +} + +TEST(lt) +{ + test_function2(test_catogories::vectors, [](auto x, auto y) { return (x < y).asvec(); }, + [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { + return internal::maskbits<subtype<decltype(x)>>(x < y); + }); +} + 
+TEST(horner) +{ + CHECK(horner(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 17, 34)); + CHECK(horner_odd(pack(0, 1, 2, 3), 1, 2, 3) == pack(0, 6, 114, 786)); + CHECK(horner_even(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 57, 262)); +} + +TEST(matrix) +{ + using i32x2x2 = vec<vec<int, 2>, 2>; + const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; + CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } }); + + CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } }); + + i32x2 xy{ 10, 20 }; + i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } }; + xy = hadd(xy * m); + CHECK(xy == i32x2{ 40, 120 }); + + i32x2 xy2{ 10, 20 }; + xy2 = hadd(transpose(xy2 * m)); + CHECK(xy2 == i32x2{ 50, 110 }); +} + +TEST(apply) +{ + CHECK(apply([](int x) { return x + 1; }, make_vector(1, 2, 3, 4, 5)) == make_vector(2, 3, 4, 5, 6)); + CHECK(apply(fn::sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/shuffle.cpp b/tests/unit/simd/shuffle.cpp @@ -0,0 +1,160 @@ +#include <kfr/simd/shuffle.hpp> +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(concat) +{ + CHECK(concat(vec<f32, 1>{ 1 }, vec<f32, 2>{ 2, 3 }, vec<f32, 1>{ 4 }, vec<f32, 3>{ 5, 6, 7 }) // + == vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }); +} + +TEST(reverse) +{ + CHECK(reverse(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(7, 6, 5, 4, 3, 2, 1, 0)); + CHECK(reverse<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(6, 7, 4, 5, 2, 3, 0, 1)); + CHECK(reverse<4>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(4, 5, 6, 7, 0, 1, 2, 3)); +} + +TEST(shuffle) +{ + const vec<int, 8> numbers1 = enumerate<int, 8>(); + const vec<int, 8> numbers2 = enumerate<int, 8, 100>(); + CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) == + vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); + CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); +} + +TEST(permute) +{ + const vec<int, 8> numbers1 = 
enumerate<int, 8>(); + CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); + CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); +} + +TEST(blend) +{ + const vec<int, 8> numbers1 = enumerate<int, 8>(); + const vec<int, 8> numbers2 = enumerate<int, 8, 100>(); + CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) == + vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); + CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) == + vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); +} + +TEST(duplicate_shuffle) +{ + CHECK(dup(pack(0, 1, 2, 3)) == pack(0, 0, 1, 1, 2, 2, 3, 3)); + CHECK(duphalfs(pack(0, 1, 2, 3)) == pack(0, 1, 2, 3, 0, 1, 2, 3)); + CHECK(dupeven(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 0, 2, 2, 4, 4, 6, 6)); + CHECK(dupodd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 1, 3, 3, 5, 5, 7, 7)); +} + +TEST(split_interleave) +{ + vec<f32, 1> a1; + vec<f32, 2> a23; + vec<f32, 1> a4; + vec<f32, 3> a567; + split(vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }, a1, a23, a4, a567); + CHECK(a1 == vec<f32, 1>{ 1 }); + CHECK(a23 == vec<f32, 2>{ 2, 3 }); + CHECK(a4 == vec<f32, 1>{ 4 }); + CHECK(a567 == vec<f32, 3>{ 5, 6, 7 }); + + CHECK(splitpairs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6, 1, 3, 5, 7)); + CHECK(splitpairs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); + + CHECK(interleavehalfs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 4, 1, 5, 2, 6, 3, 7)); + CHECK(interleavehalfs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7)); +} + +TEST(broadcast) +{ + CHECK(broadcast<8>(1) == pack(1, 1, 1, 1, 1, 1, 1, 1)); + CHECK(broadcast<8>(1, 2) == pack(1, 2, 1, 2, 1, 2, 1, 2)); + CHECK(broadcast<8>(1, 2, 3, 4) == pack(1, 2, 3, 4, 1, 2, 3, 4)); + CHECK(broadcast<8>(1, 2, 3, 4, 5, 6, 7, 8) == pack(1, 2, 3, 4, 5, 6, 7, 8)); + + CHECK(broadcast<5>(3.f) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); + CHECK(broadcast<6>(1.f, 2.f) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 
}); + CHECK(broadcast<6>(1.f, 2.f, 3.f) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); +} + +TEST(resize) +{ + CHECK(resize<5>(make_vector(3.f)) == vec<f32, 5>{ 3, 3, 3, 3, 3 }); + CHECK(resize<6>(make_vector(1.f, 2.f)) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 }); + CHECK(resize<6>(make_vector(1.f, 2.f, 3.f)) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 }); +} + +TEST(make_vector) +{ + const signed char ch = -1; + CHECK(make_vector(1, 2, ch) == vec<i32, 3>{ 1, 2, -1 }); + const i64 v = -100; + CHECK(make_vector(1, 2, v) == vec<i64, 3>{ 1, 2, -100 }); + CHECK(make_vector<i64>(1, 2, ch) == vec<i64, 3>{ 1, 2, -1 }); + CHECK(make_vector<f32>(1, 2, ch) == vec<f32, 3>{ 1, 2, -1 }); + + CHECK(make_vector(f64x2{ 1, 2 }, f64x2{ 10, 20 }) == + vec<vec<f64, 2>, 2>{ f64x2{ 1, 2 }, f64x2{ 10, 20 } }); + CHECK(make_vector(1.f, f32x2{ 10, 20 }) == vec<vec<f32, 2>, 2>{ f32x2{ 1, 1 }, f32x2{ 10, 20 } }); +} + +TEST(zerovector) +{ + CHECK(zerovector<f32, 3>() == f32x3{ 0, 0, 0 }); + // CHECK(zerovector<i16, 3>() == i16x3{ 0, 0, 0 }); // clang 3.9 (trunk) crashes here + CHECK(zerovector(f64x8{}) == f64x8{ 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(allonesvector) +{ + CHECK(bitcast<u32>(special_constants<f32>::allones()) == 0xFFFFFFFFu); + CHECK(bitcast<u64>(special_constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull); + + CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 }); + CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 }); +} + +TEST(transpose) +{ + const auto sixteen = enumerate<float, 16>(); + CHECK(transpose<4>(sixteen) == vec<float, 16>(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); +} + +TEST(odd_even) +{ + CHECK(even(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6)); + CHECK(odd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 3, 5, 7)); + + CHECK(even<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5)); + CHECK(odd<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(2, 3, 6, 7)); +} + +TEST(low_high) +{ + CHECK(low(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 8>(1, 2, 
3, 4, 5, 6, 7, 8)) == vec<u8, 4>(5, 6, 7, 8)); + + CHECK(low(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 3>(5, 6, 7)); + + CHECK(low(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 2>(5, 6)); + + CHECK(low(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 4>(1, 2, 3, 4)); + CHECK(high(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 1>(5)); + + CHECK(low(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(1, 2)); + CHECK(high(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(3, 4)); + + CHECK(low(vec<u8, 3>(1, 2, 3)) == vec<u8, 2>(1, 2)); + CHECK(high(vec<u8, 3>(1, 2, 3)) == vec<u8, 1>(3)); + + CHECK(low(vec<u8, 2>(1, 2)) == vec<u8, 1>(1)); + CHECK(high(vec<u8, 2>(1, 2)) == vec<u8, 1>(2)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp @@ -0,0 +1,114 @@ +#include <kfr/simd/vec.hpp> + +#include <kfr/io/tostring.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +TEST(cones) +{ + CHECK(vec<int, 2>(cones) == vec<int, 2>(-1, -1)); + CHECK(vec<float, 2>(cones) == vec<f32, 2>(bitcast<f32>(-1), bitcast<f32>(-1))); +} +TEST(vec_broadcast) +{ + CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f }); + CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 }); + CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 }); +} +template <typename Tout, typename Tin> +bool is_in_range_of(Tin x) +{ + return (is_f_class<Tin>::value && is_f_class<Tout>::value) || static_cast<Tin>(static_cast<Tout>(x)) == x; +} + +TEST(cast) +{ + testo::assert_is_same<i32x4, kfr::common_type<i32x4>>(); + testo::assert_is_same<u32x4, kfr::common_type<i32x4, u32x4>>(); + testo::assert_is_same<f64x4, kfr::common_type<i32x4, u32x4, f64x4>>(); + + CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 }); + + CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{ + vec<float, 4>{ 1.f, 2.f, 3.f, 4.f 
}, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) == + vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } }); + + static_assert(std::is_convertible<float, f32x4>::value, ""); + static_assert(std::is_convertible<float, f64x8>::value, ""); + static_assert(std::is_convertible<float, u8x3>::value, ""); + + static_assert(std::is_convertible<u16x4, i32x4>::value, ""); + static_assert(!std::is_convertible<u16x4, i32x3>::value, ""); + static_assert(!std::is_convertible<u16x1, u16x16>::value, ""); + + static_assert(is_same<decltype(innercast<f64>(f32x4x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(innercast<f64>(f32x4(1))), f64x4>::value, ""); + static_assert(is_same<decltype(innercast<f64>(f32(1))), f64>::value, ""); + + // N/A static_assert(is_same<decltype(innercast<f64x4>(f32x4x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(innercast<f64x4>(f32x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(innercast<f64x4>(f32(1))), f64x4>::value, ""); + + // N/A static_assert(is_same<decltype(elemcast<f64>(f32x4x4(1))), f64x4>::value, ""); + static_assert(is_same<decltype(elemcast<f64>(f32x4(1))), f64x4>::value, ""); + + static_assert(is_same<decltype(elemcast<f64x4>(f32x4x4(1))), f64x4x4>::value, ""); + static_assert(is_same<decltype(elemcast<f64x4>(f32x4(1))), f64x4x4>::value, ""); + + testo::scope s(""); + s.text = ("target_type = u8"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<u8>(x); }, + [](auto x) -> u8 { return static_cast<u8>(x); }, + [](auto t, special_value x) { return is_in_range_of<u8>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i8"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i8>(x); }, + [](auto x) -> i8 { return static_cast<i8>(x); }, + [](auto t, special_value x) { return is_in_range_of<i8>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = u16"); + test_function1( 
+ test_catogories::all, [](auto x) { return kfr::innercast<u16>(x); }, + [](auto x) -> u16 { return static_cast<u16>(x); }, + [](auto t, special_value x) { return is_in_range_of<u16>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i16"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i16>(x); }, + [](auto x) -> i16 { return static_cast<i16>(x); }, + [](auto t, special_value x) { return is_in_range_of<i16>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = u32"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<u32>(x); }, + [](auto x) -> u32 { return static_cast<u32>(x); }, + [](auto t, special_value x) { return is_in_range_of<u32>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i32"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i32>(x); }, + [](auto x) -> i32 { return static_cast<i32>(x); }, + [](auto t, special_value x) { return is_in_range_of<i32>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = u64"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<u64>(x); }, + [](auto x) -> u64 { return static_cast<u64>(x); }, + [](auto t, special_value x) { return is_in_range_of<u64>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = i64"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<i64>(x); }, + [](auto x) -> i64 { return static_cast<i64>(x); }, + [](auto t, special_value x) { return is_in_range_of<i64>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = f32"); + test_function1( + test_catogories::all, [](auto x) { return kfr::innercast<f32>(x); }, + [](auto x) -> f32 { return static_cast<f32>(x); }, + [](auto t, special_value x) { return is_in_range_of<f32>(x.get<subtype<type_of<decltype(t)>>>()); }); + s.text = ("target_type = f64"); + test_function1( + test_catogories::all, [](auto x) { 
return kfr::innercast<f64>(x); }, + [](auto x) -> f64 { return static_cast<f64>(x); }, + [](auto t, special_value x) { return is_in_range_of<f64>(x.get<subtype<type_of<decltype(t)>>>()); }); +} + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (C) 2016 D Levin (http://www.kfrlib.com) +# This file is part of KFR +# +# KFR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# KFR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with KFR. + + +cmake_minimum_required(VERSION 3.1) + +# Binary output directories +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) + +add_executable(sample_rate_converter sample_rate_converter.cpp) +target_link_libraries(sample_rate_converter kfr kfr_io use_arch) + +add_executable(ebu_test ebu_test.cpp) +target_link_libraries(ebu_test kfr kfr_io use_arch) diff --git a/tools/ebu_test.cpp b/tools/ebu_test.cpp @@ -0,0 +1,120 @@ +/** + * KFR (http://kfrlib.com) + * Copyright (C) 2016 D Levin + * See LICENSE.txt for details + */ + +#include <kfr/base.hpp> +#include <kfr/dsp.hpp> +#include <kfr/io.hpp> + +using namespace kfr; + +int main(int argc, char** argv) +{ + if (argc < 3) + { + println("Usage: ebu_test INPUT_IN_F32_RAW_FORMAT CHANNEL_NUMBER"); + return 1; + } + + // Prepare + FILE* f = fopen(argv[1], "rb"); + const int channel_number = atoi(argv[2]); + if (channel_number < 1 || channel_number > 6) + { + println("Incorrect 
number of channels"); + return 1; + } + fseek(f, 0, SEEK_END); + uintmax_t size = ftell(f); + fseek(f, 0, SEEK_SET); + if (size % (sizeof(float) * channel_number)) + { + println("Incorrect file size"); + return 1; + } + + // Read file + const size_t length = size / (sizeof(float) * channel_number); + univector<float> interleaved(size / sizeof(float)); + size_t read_len = fread(interleaved.data(), 1, size, f); + if (read_len != size) + { + println("Can't read file"); + return 1; + } + + // Deinterleave + univector<univector<float>> data(channel_number, univector<float>(length)); + for (size_t ch = 0; ch < channel_number; ++ch) + { + for (size_t i = 0; i < length; ++i) + { + data[ch][i] = interleaved[i * channel_number + ch]; + } + } + + std::vector<Speaker> speakers; + switch (channel_number) + { + case 1: + speakers = { Speaker::Mono }; + break; + case 2: + speakers = { Speaker::Left, Speaker::Right }; + break; + case 3: + speakers = { Speaker::Left, Speaker::Right, Speaker::Center }; + break; + case 4: + speakers = { Speaker::Left, Speaker::Right, Speaker::LeftSurround, Speaker::RightSurround }; + break; + case 5: + speakers = { Speaker::Left, Speaker::Right, Speaker::Center, Speaker::LeftSurround, + Speaker::RightSurround }; + break; + case 6: + speakers = { Speaker::Left, Speaker::Right, Speaker::Center, + Speaker::LeftSurround, Speaker::RightSurround, Speaker::Lfe }; + break; + } + + ebu_r128<float> loudness(48000, speakers); + + float M, S, I, RL, RH; + float maxM = -HUGE_VALF, maxS = -HUGE_VALF; + for (size_t i = 0; i < length / loudness.packet_size(); i++) + { + std::vector<univector_ref<float>> channels; + for (size_t ch = 0; ch < channel_number; ++ch) + { + channels.push_back(data[ch].slice(i * loudness.packet_size(), loudness.packet_size())); + } + loudness.process_packet(channels); + loudness.get_values(M, S, I, RL, RH); + maxM = std::max(maxM, M); + maxS = std::max(maxS, S); + } + + { + // For file-based measurements, the signal should be followed by at 
least 1.5 s of silence + std::vector<univector_dyn<float>> channels(channel_number, + univector_dyn<float>(loudness.packet_size())); + for (size_t i = 0; i < 15; ++i) + loudness.process_packet(channels); + float dummyM, dummyS, dummyI; + loudness.get_values(dummyM, dummyS, dummyI, RL, RH); + } + + println(argv[1]); + println("M = ", M); + println("S = ", S); + println("I = ", I); + println("LRA = ", RH - RL); + println("maxM = ", maxM); + println("maxS = ", maxS); + println(); + + return 0; +} diff --git a/examples/sample_rate_converter.cpp b/tools/sample_rate_converter.cpp diff --git a/update-sources.py b/update-sources.py @@ -7,25 +7,33 @@ import subprocess import sys import glob -path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'include') +def list_sources(name, searchpath, masks): + global cmake + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), searchpath) + filenames = [] + for root, dirnames, files in os.walk(path, path): + for mask in masks: + for filename in fnmatch.filter(files, mask): + filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/')) -masks = ['*.hpp', '*.h', '*.i', '*.inc'] + cmake += """ +set( + """ + name + """ + """ + "\n ".join(['${PROJECT_SOURCE_DIR}/' + searchpath + '/' + f for f in filenames]) + """ +) -filenames = [] -for root, dirnames, files in os.walk(path, path): - for mask in masks: - for filename in fnmatch.filter(files, mask): - filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/')) + """ cmake = """ # Auto-generated file. 
Do not edit # Use update-sources.py - -set( - KFR_SRC - """ + "\n ".join(['${PROJECT_SOURCE_DIR}/include/' + f for f in filenames]) + """ -) """ +list_sources("KFR_SRC", "include", ['*.hpp', '*.h', '*.i', '*.inc']) +list_sources("KFR_DFT_SRC", "include/kfr/dft", ['*.cpp']) +list_sources("KFR_IO_SRC", "include/kfr/io", ['*.cpp']) + +list_sources("KFR_UNITTEST_SRC", "tests/unit", ['*.cpp']) + with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sources.cmake'), "w") as f: f.write(cmake)