kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 85000e10217241e44402e582ffe366e1a8932389
parent c9da5fa2939fce901f20629aececfc967673e17c
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 11 Oct 2016 03:29:41 +0300

Constants refactoring, macros for pragma, MSVC support

Diffstat:
MCMakeLists.txt | 3+++
Mexamples/biquads.cpp | 2+-
Mexamples/dft.cpp | 4+---
Mexamples/fir.cpp | 2+-
Mexamples/sample_rate_conversion.cpp | 2+-
Mexamples/window.cpp | 2+-
Minclude/kfr/base/abs.hpp | 8++++----
Minclude/kfr/base/atan.hpp | 6++++--
Minclude/kfr/base/basic_expressions.hpp | 22+++++++++++-----------
Minclude/kfr/base/complex.hpp | 40+++++++++++++++++++++++++++++++++++-----
Minclude/kfr/base/constants.hpp | 258++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
Minclude/kfr/base/cpuid.hpp | 18+++++++++++++++++-
Minclude/kfr/base/digitreverse.hpp | 16++++++++++++----
Minclude/kfr/base/expression.hpp | 11++++++-----
Minclude/kfr/base/function.hpp | 59++++++++++++++++++++++++++++-------------------------------
Minclude/kfr/base/gamma.hpp | 6+++---
Minclude/kfr/base/generators.hpp | 14+++++++-------
Minclude/kfr/base/kfr.h | 15+++++++++------
Minclude/kfr/base/log_exp.hpp | 26+++++++++++++-------------
Minclude/kfr/base/logical.hpp | 16++++++++--------
Minclude/kfr/base/memory.hpp | 11++++++++---
Minclude/kfr/base/modzerobessel.hpp | 89+++++++++++++++++++++++++++++++++++++++----------------------------------------
Minclude/kfr/base/operators.hpp | 16++++++++--------
Minclude/kfr/base/platform.hpp | 113+++++++++++++++++++++++++++++++++++--------------------------------------------
Minclude/kfr/base/pointer.hpp | 7++++---
Minclude/kfr/base/random.hpp | 18++++++++++++------
Minclude/kfr/base/read_write.hpp | 74+++++++++++++++++++++++++++++++++++++-------------------------------------
Minclude/kfr/base/reduce.hpp | 50+++++++++++++++++++++++++-------------------------
Minclude/kfr/base/select.hpp | 8++++----
Minclude/kfr/base/shuffle.hpp | 30+++++++++++++-----------------
Minclude/kfr/base/simd.hpp | 87++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Minclude/kfr/base/sin_cos.hpp | 92+++++++++++++++++++++++++++++++++++--------------------------------------------
Minclude/kfr/base/sort.hpp | 4++--
Minclude/kfr/base/tan.hpp | 68++++++++++++++++++++++++++++++++++----------------------------------
Minclude/kfr/base/types.hpp | 62+++++++++++++-------------------------------------------------
Minclude/kfr/base/univector.hpp | 14++++++++++----
Minclude/kfr/base/vec.hpp | 210++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Minclude/kfr/cident.h | 50++++++++++++++++++++++++++++++++++++++++++++++++--
Minclude/kfr/cometa.hpp | 262+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Minclude/kfr/cometa/array.hpp | 5+++++
Minclude/kfr/cometa/cstring.hpp | 14+++++++-------
Minclude/kfr/cometa/ctti.hpp | 6+++---
Minclude/kfr/cometa/function.hpp | 2+-
Minclude/kfr/cometa/named_arg.hpp | 5+++++
Minclude/kfr/cometa/string.hpp | 71+++++++++++++++++++++++++++++++++++++++++------------------------------
Minclude/kfr/data/sincos.hpp | 220+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Minclude/kfr/dft/bitrev.hpp | 24++++++++++++------------
Minclude/kfr/dft/cache.hpp | 17++++++++++-------
Minclude/kfr/dft/conv.hpp | 10+++++-----
Minclude/kfr/dft/fft.hpp | 310++++++++++++++++++++++++++++++++++++++++---------------------------------------
Minclude/kfr/dft/ft.hpp | 371+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Minclude/kfr/dsp/biquad.hpp | 6+++---
Minclude/kfr/dsp/delay.hpp | 2+-
Minclude/kfr/dsp/fir.hpp | 2+-
Minclude/kfr/dsp/mixdown.hpp | 14+++++++++++---
Minclude/kfr/dsp/oscillators.hpp | 43++++++++++++++++++++++---------------------
Minclude/kfr/dsp/sample_rate_conversion.hpp | 4++--
Minclude/kfr/dsp/units.hpp | 20++++++++++----------
Minclude/kfr/dsp/window.hpp | 8++++----
Minclude/kfr/io/audiofile.hpp | 71++++++++++++++++++++++++++++++++++++++++-------------------------------
Minclude/kfr/io/python_plot.hpp | 6+++---
Minclude/kfr/io/tostring.hpp | 4++++
Mtests/CMakeLists.txt | 21+++++++++++++--------
Mtests/base_test.cpp | 22+++++++++++++++-------
Mtests/complex_test.cpp | 86+++++++++++++++++++++++++++++++++++++++----------------------------------------
Mtests/dft_test.cpp | 2+-
Mtests/dsp_test.cpp | 13++++++-------
Mtests/empty_test.cpp | 2+-
Mtests/expression_test.cpp | 2+-
Mtests/intrinsic_test.cpp | 28++++++++++++++++++++++++----
Mtests/multiarch.cpp | 2+-
Mtests/testo/testo.hpp | 24++++++++++++------------
Mtests/transcendental_test.cpp | 17+++++++++--------
73 files changed, 1887 insertions(+), 1362 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -56,6 +56,9 @@ set(CMAKE_C_FLAGS "${OPT_TARGET} ${OPT_BITNESS} ${OPT_STATIC}" CACHE STRING "com project(kfr) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin/Debug) + include(sources.cmake) set(ALL_WARNINGS -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-c99-extensions -Wno-padded) diff --git a/examples/biquads.cpp b/examples/biquads.cpp @@ -10,7 +10,7 @@ using namespace kfr; -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/examples/dft.cpp b/examples/dft.cpp @@ -11,7 +11,7 @@ using namespace kfr; -int main(int argc, char** argv) +int main() { println(library_version()); @@ -45,7 +45,5 @@ int main(int argc, char** argv) println(in); println(); println(dB); - (void)argc; - (void)argv; return 0; } diff --git a/examples/fir.cpp b/examples/fir.cpp @@ -10,7 +10,7 @@ using namespace kfr; -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/examples/sample_rate_conversion.cpp b/examples/sample_rate_conversion.cpp @@ -15,7 +15,7 @@ constexpr size_t output_sr = 44100; constexpr size_t len = 96000 * 6; constexpr fbase i32max = 2147483647.0; -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/examples/window.cpp b/examples/window.cpp @@ -10,7 +10,7 @@ using namespace kfr; -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp @@ -41,7 +41,7 @@ namespace intrinsics template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) { - return x & internal::invhighbitmask<T>; + return x & constants<T>::invhighbitmask(); } KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); } @@ -86,7 +86,7 @@ KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); } #if defined CMT_ARCH_NEON64 KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); } #else -KFR_SINTRIN f64neon abs(const f64neon& x) { return x & internal::invhighbitmask<f64>; } +KFR_SINTRIN f64neon abs(const f64neon& x) { return x & constants<f64>::invhighbitmask(); } #endif KFR_HANDLE_ALL_SIZES_1(abs) @@ -97,14 +97,14 @@ KFR_HANDLE_ALL_SIZES_1(abs) template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) { - return x & internal::invhighbitmask<T>; + return x & constants<T>::invhighbitmask(); } // fallback template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) { - return select(x >= T(), x, -x); + return select(x >= T(0), x, -x); } #endif KFR_I_CONVERTER(abs) diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp @@ -36,8 +36,9 @@ namespace kfr namespace intrinsics { template <size_t N> -KFR_SINTRIN vec<f32, N> atan2k(vec<f32, N> y, vec<f32, N> x) +KFR_SINTRIN vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx) { + vec<f32, N> x = xx, y = yy; vec<f32, N> s, t, u; vec<i32, N> q; q = select(x < 0, -2, 0); @@ -64,8 +65,9 @@ KFR_SINTRIN vec<f32, N> atan2k(vec<f32, N> y, vec<f32, N> x) } template <size_t N> -KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x) +KFR_SINTRIN vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx) { + vec<f64, N> x = xx, y = yy; vec<f64, N> s, t, u; vec<i64, N> q; q = select(x < 0, i64(-2), i64(0)); diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp @@ -276,7 +276,7 @@ struct expression_linspace<T, true> : input_expression return mix((enumerate(x) + cast<T>(cast<TI>(index))) * invsize, cast<T>(start), cast<T>(stop)); } template <typename U, size_t N> - CMT_INLINE static vec<U, N> mix(vec<U, N> t, U x, U y) + CMT_INLINE static vec<U, N> mix(const vec<U, N>& t, U x, U y) { return (U(1.0) - t) * x + t * y; } @@ -296,12 +296,12 @@ public: using T = value_type; template <typename... Expr_> - CMT_INLINE expression_sequence(const size_t (&segments)[base::size], Expr_&&... expr) noexcept + CMT_INLINE expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) noexcept : base(std::forward<Expr_>(expr)...) { std::copy(std::begin(segments), std::end(segments), this->segments.begin() + 1); - this->segments[0] = 0; - this->segments[base::size + 1] = size_t(-1); + this->segments[0] = 0; + this->segments[base::count + 1] = size_t(-1); } template <size_t N> @@ -314,7 +314,7 @@ public: else { vec<T, N> result; -#pragma clang loop unroll_count(4) + CMT_PRAGMA_CLANG(clang loop unroll_count(4)) for (size_t i = 0; i < N; i++) { sindex = segments[sindex + 1] == index ? sindex + 1 : sindex; @@ -329,12 +329,12 @@ protected: template <size_t N> CMT_NOINLINE vec<T, N> get(cinput_t cinput, size_t index, size_t expr_index, vec_t<T, N> y) { - return cswitch(indicesfor<E...>, expr_index, + return cswitch(indicesfor_t<E...>(), expr_index, [&](auto val) { return this->argument(cinput, val, index, y); }, [&]() { return zerovector(y); }); } - std::array<size_t, base::size + 2> segments; + std::array<size_t, base::count + 2> segments; }; template <typename Fn, typename E> @@ -475,7 +475,7 @@ struct multioutput : output_expression template <typename T, size_t N> void operator()(coutput_t coutput, size_t index, const vec<T, N>& x) { - cfor(csize<0>, csize<sizeof...(E)>, + cfor(csize_t<0>(), csize_t<sizeof...(E)>(), [&](auto n) { std::get<val_of(decltype(n)())>(outputs)(coutput, index, x); }); } std::tuple<E...> outputs; @@ -517,7 +517,7 @@ struct expression_unpack : private expression<E...>, output_expression template <typename U, size_t N> CMT_INLINE void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x) { - output(coutput, index, x, csizeseq<count>); + output(coutput, index, x, csizeseq_t<count>()); } template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)> @@ -576,9 +576,9 @@ task_partition<OutExpr, InExpr> partition(OutExpr&& output, InExpr&& input, size { static_assert(!is_infinite<OutExpr>::value || !is_infinite<InExpr>::value, ""); - minimum_size = minimum_size == 0 ? vector_width<T> * 8 : minimum_size; + minimum_size = minimum_size == 0 ? platform<T>::vector_width * 8 : minimum_size; const size_t size = size_min(output.size(), input.size()); - const size_t chunk_size = align_up(std::max(size / count, minimum_size), vector_width<T>); + const size_t chunk_size = align_up(std::max(size / count, minimum_size), platform<T>::vector_width); task_partition<OutExpr, InExpr> result(std::forward<OutExpr>(output), std::forward<InExpr>(input), size, chunk_size, (size + chunk_size - 1) / chunk_size); diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp @@ -40,6 +40,9 @@ #include <complex> #endif +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4814)) + namespace kfr { #ifdef KFR_STD_COMPLEX @@ -72,8 +75,13 @@ struct complex constexpr complex(complex<U>&& other) noexcept : re(std::move(other.re)), im(std::move(other.im)) { } +#ifdef CMT_COMPILER_GNU constexpr complex& operator=(const complex&) noexcept = default; constexpr complex& operator=(complex&&) noexcept = default; +#else + complex& operator=(const complex&) = default; + complex& operator=(complex&&) = default; +#endif constexpr const T& real() const noexcept { return re; } constexpr const T& imag() const noexcept { return im; } constexpr void real(T value) noexcept { re = value; } @@ -158,7 +166,7 @@ struct compound_type_traits<kfr::complex<T>> template <typename U> using rebind = kfr::complex<U>; template <typename U> - using deep_rebind = kfr::complex<cometa::deep_rebind<subtype, U>>; + using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>; static constexpr subtype at(const kfr::complex<T>& value, size_t index) { @@ -189,15 +197,15 @@ struct vec_op<complex<T>, N> : private vec_op<T, N * 2> constexpr static size_t w = N * 2; - CMT_INLINE constexpr static simd<scalar_type, w> mul(const simd<scalar_type, w>& x, - const simd<scalar_type, w>& y) noexcept + CMT_INLINE static simd<scalar_type, w> mul(const simd<scalar_type, w>& x, + const simd<scalar_type, w>& y) noexcept { const vec<scalar_type, w> xx = x; const vec<scalar_type, w> yy = y; return *subadd(xx * dupeven(yy), swap<2>(xx) * dupodd(yy)); } - CMT_INLINE constexpr static simd<scalar_type, w> div(const simd<scalar_type, w>& x, - const simd<scalar_type, w>& y) noexcept + CMT_INLINE static simd<scalar_type, w> div(const simd<scalar_type, w>& x, + const simd<scalar_type, w>& y) noexcept { const vec<scalar_type, w> xx = x; const vec<scalar_type, w> yy = y; @@ -670,4 +678,26 @@ struct common_type<T1, kfr::complex<T2>> { using type = kfr::complex<typename common_type<T1, T2>::type>; }; +template <typename T1, typename T2, size_t N> +struct common_type<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>> +{ + using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; +}; +template <typename T1, typename T2, size_t N> +struct common_type<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>> +{ + using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; +}; +template <typename T1, typename T2, size_t N> +struct common_type<kfr::complex<T1>, kfr::vec<T2, N>> +{ + using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; +}; +template <typename T1, typename T2, size_t N> +struct common_type<kfr::vec<T1, N>, kfr::complex<T2>> +{ + using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>; +}; } + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/constants.hpp b/include/kfr/base/constants.hpp @@ -31,66 +31,256 @@ namespace kfr { +#if CMT_COMPILER_GNU +constexpr double infinity = __builtin_inf(); +constexpr double qnan = __builtin_nan(""); +#else +constexpr double infinity = HUGE_VAL; +constexpr double qnan = NAN; +#endif + +template <typename T> +struct constants +{ +public: + using Tsub = subtype<T>; + + constexpr static Tsub pi_s(int m, int d = 1) { return pi * m / d; } + constexpr static Tsub recip_pi_s(int m, int d = 1) { return recip_pi * m / d; } + + constexpr static Tsub pi = static_cast<Tsub>(3.1415926535897932384626433832795); + constexpr static Tsub sqr_pi = static_cast<Tsub>(9.8696044010893586188344909998762); + constexpr static Tsub recip_pi = static_cast<Tsub>(0.31830988618379067153776752674503); + constexpr static Tsub degtorad = static_cast<Tsub>(pi / 180); + constexpr static Tsub radtodeg = static_cast<Tsub>(pi * 180); + constexpr static Tsub e = static_cast<Tsub>(2.718281828459045235360287471352662); + constexpr static Tsub recip_log_2 = static_cast<Tsub>(1.442695040888963407359924681001892137426645954); + constexpr static Tsub recip_log_10 = static_cast<Tsub>(0.43429448190325182765112891891661); + constexpr static Tsub log_2 = static_cast<Tsub>(0.69314718055994530941723212145818); + constexpr static Tsub log_10 = static_cast<Tsub>(2.3025850929940456840179914546844); + constexpr static Tsub sqrt_2 = static_cast<Tsub>(1.4142135623730950488016887242097); + + constexpr static Tsub fold_constant_div = choose_const<Tsub>( + CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01)); + + constexpr static Tsub fold_constant_hi = choose_const<Tsub>( + CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01)); + constexpr static Tsub fold_constant_rem1 = + choose_const<Tsub>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f), + CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08)); + constexpr static Tsub fold_constant_rem2 = + choose_const<Tsub>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f), + CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15)); + + constexpr static Tsub epsilon = std::numeric_limits<Tsub>::epsilon(); + constexpr static Tsub infinity = std::numeric_limits<Tsub>::infinity(); + constexpr static Tsub neginfinity = -std::numeric_limits<Tsub>::infinity(); + constexpr static Tsub qnan = std::numeric_limits<Tsub>::quiet_NaN(); + +#if CMT_COMPILER_GNU + + constexpr static Tsub allones() + { + if (is_same<Tsub, f32>::value) + { + return -__builtin_nanf("0xFFFFFFFF"); + } + else if (is_same<Tsub, f64>::value) + { + return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); + } + else + { + return static_cast<Tsub>(-1ll); + } + } + + constexpr static Tsub allzeros() { return Tsub(0); } + + constexpr static Tsub highbitmask() + { + if (is_same<Tsub, f32>::value) + { + return -0.0f; + } + else if (is_same<Tsub, f64>::value) + { + return -0.0; + } + else + { + return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1)); + } + } + + constexpr static Tsub invhighbitmask() + { + if (is_same<Tsub, f32>::value) + { + return __builtin_nanf("0xFFFFFFFF"); + } + else if (is_same<Tsub, f64>::value) + { + return __builtin_nan("0xFFFFFFFFFFFFFFFF"); + } + else + { + return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1); + } + } +#else + + static Tsub allones() + { + if (is_same<Tsub, f32>::value) + { + return static_cast<Tsub>(bitcast<f32>(0xFFFFFFFFu)); + } + else if (is_same<Tsub, f64>::value) + { + return static_cast<Tsub>(bitcast<f64>(0xFFFFFFFFFFFFFFFFull)); + } + else + { + return static_cast<Tsub>(-1ll); + } + } + + constexpr static Tsub allzeros() { return Tsub(0); } + + static Tsub highbitmask() + { + if (is_same<Tsub, f32>::value) + { + return static_cast<Tsub>(-0.0f); + } + else if (is_same<Tsub, f64>::value) + { + return static_cast<Tsub>(-0.0); + } + else + { + return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1)); + } + } + + static Tsub invhighbitmask() + { + if (is_same<Tsub, f32>::value) + { + return static_cast<Tsub>(bitcast<f32>(0x7FFFFFFFu)); + } + else if (is_same<Tsub, f64>::value) + { + return static_cast<Tsub>(bitcast<f64>(0x7FFFFFFFFFFFFFFFull)); + } + else + { + return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1); + } + } +#endif +}; + +template <typename T> +constexpr subtype<T> constants<T>::pi; +template <typename T> +constexpr subtype<T> constants<T>::sqr_pi; +template <typename T> +constexpr subtype<T> constants<T>::recip_pi; +template <typename T> +constexpr subtype<T> constants<T>::degtorad; +template <typename T> +constexpr subtype<T> constants<T>::radtodeg; +template <typename T> +constexpr subtype<T> constants<T>::e; +template <typename T> +constexpr subtype<T> constants<T>::recip_log_2; +template <typename T> +constexpr subtype<T> constants<T>::recip_log_10; +template <typename T> +constexpr subtype<T> constants<T>::log_2; +template <typename T> +constexpr subtype<T> constants<T>::log_10; +template <typename T> +constexpr subtype<T> constants<T>::sqrt_2; +template <typename T> +constexpr subtype<T> constants<T>::fold_constant_div; +template <typename T> +constexpr subtype<T> constants<T>::fold_constant_hi; +template <typename T> +constexpr subtype<T> constants<T>::fold_constant_rem1; +template <typename T> +constexpr subtype<T> constants<T>::fold_constant_rem2; +template <typename T> +constexpr subtype<T> constants<T>::epsilon; +template <typename T> +constexpr subtype<T> constants<T>::infinity; +template <typename T> +constexpr subtype<T> constants<T>::neginfinity; +template <typename T> +constexpr subtype<T> constants<T>::qnan; + // π (pi) // c_pi<f64, 4> = 4pi // c_pi<f64, 3, 4> = 3/4pi -template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>> -constexpr Tsub c_pi = Tsub(3.1415926535897932384626433832795 * m / d); +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d); // π² (pi²) // c_sqr_pi<f64, 4> = 4pi² // c_sqr_pi<f64, 3, 4> = 3/4pi² -template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>> -constexpr Tsub c_sqr_pi = Tsub(9.8696044010893586188344909998762 * m / d); +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d); // 1/π (1/pi) // c_recip_pi<f64> 1/pi // c_recip_pi<f64, 4> 4/pi -template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>> -constexpr Tsub c_recip_pi = Tsub(0.31830988618379067153776752674503 * m / d); +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_recip_pi = subtype<T>(0.31830988618379067153776752674503 * m / d); // degree to radian conversion factor -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_degtorad = c_pi<T, 1, 180>; +template <typename T> +constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>; // radian to degree conversion factor -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_radtodeg = c_recip_pi<T, 180>; +template <typename T> +constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>; // e, Euler's number -template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>> -constexpr Tsub c_e = Tsub(2.718281828459045235360287471352662 * m / d); +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d); -template <typename T, typename Tsub = subtype<T>> -constexpr unsigned c_mantissa_bits = sizeof(Tsub) == 32 ? 23 : 52; +template <typename T> +constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 32 ? 23 : 52; -template <typename T, typename Tsub = usubtype<T>> -constexpr Tsub c_mantissa_mask = (Tsub(1) << c_mantissa_bits<T>)-1; +template <typename T> +constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1; -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_epsilon = (std::numeric_limits<Tsub>::epsilon()); +template <typename T> +constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon()); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_infinity = std::numeric_limits<Tsub>::infinity(); +template <typename T> +constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity(); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_neginfinity = -std::numeric_limits<Tsub>::infinity(); +template <typename T> +constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity(); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_qnan = std::numeric_limits<Tsub>::quiet_NaN(); +template <typename T> +constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN(); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_recip_log_2 = Tsub(1.442695040888963407359924681001892137426645954); +template <typename T> +constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_recip_log_10 = Tsub(0.43429448190325182765112891891661); +template <typename T> +constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_log_2 = Tsub(0.69314718055994530941723212145818); +template <typename T> +constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818); -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub c_log_10 = Tsub(2.3025850929940456840179914546844); +template <typename T> +constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844); -template <typename T, int m = 1, int d = 1, typename Tsub = subtype<T>> -constexpr Tsub c_sqrt_2 = Tsub(1.4142135623730950488016887242097 * m / d); +template <typename T, int m = 1, int d = 1> +constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d); } diff --git a/include/kfr/base/cpuid.hpp b/include/kfr/base/cpuid.hpp @@ -25,6 +25,10 @@ */ #pragma once +#ifdef _MSC_VER +#include <intrin.h> +#endif + #include "types.hpp" #include <cstring> @@ -121,9 +125,21 @@ CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) CMT_INLINE u32 get_xcr0() { u32 xcr0; - __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); + __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); return xcr0; } +#elif defined CMT_COMPILER_MSVC + +CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) { __cpuidex((int*)ptr, (int)func, (int)subfunc); } +CMT_INLINE u32 get_xcr0() +{ +#ifdef _XCR_XFEATURE_ENABLED_MASK + unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + return (u32)Result; +#else + return 0; +#endif +} #endif template <size_t = 0> diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp @@ -48,13 +48,14 @@ constexpr inline u32 bit_permute_step_simple(u32 x, u32 m, u32 shift) return ((x & m) << shift) | ((x >> shift) & m); } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshift-count-overflow" -#pragma GCC diagnostic ignored "-Wshift-count-negative" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") template <size_t radix, size_t bits> constexpr enable_if<radix == 4, u32> digitreverse(u32 x) { +#ifdef CMT_COMPILER_GNU if (bits <= 2) return x; if (bits <= 4) @@ -84,9 +85,16 @@ constexpr enable_if<radix == 4, u32> digitreverse(u32 x) return x >> (32 - bits); } return x; +#else + x = bit_permute_step_simple(x, 0x33333333, 2); // Bit index complement 1 regroups 4 bits + x = bit_permute_step_simple(x, 0x0f0f0f0f, 4); // Bit index complement 2 regroups 8 bits + x = bit_permute_step_simple(x, 0x00ff00ff, 8); // Bit index complement 3 regroups 16 bits + x = bit_permute_step_simple(x, 0x0000ffff, 16); // Bit index complement 4 regroups 32 bits + return x >> (32 - bits); +#endif } -#pragma GCC diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) template <size_t radix, size_t bits> struct shuffle_index_digitreverse diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp @@ -31,8 +31,8 @@ #include <tuple> -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") namespace kfr { @@ -399,9 +399,9 @@ CMT_INLINE size_t process(OutputExpr&& out, const InputExpr& in, size_t start = in.begin_block(cinput, size); #ifdef NDEBUG - constexpr size_t w = width == 0 ? internal::get_vector_width<Tin, c>(2, 4) : width; + constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width * bitness_const(2, 4) : width; #else - constexpr size_t w = width == 0 ? internal::get_vector_width<Tin, c>(1, 1) : width; + constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width * bitness_const(1, 1) : width; #endif size_t i = start; @@ -442,4 +442,5 @@ struct output_expression_base : output_expression } }; } -#pragma clang diagnostic pop + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp @@ -30,8 +30,8 @@ #include "types.hpp" #include "vec.hpp" -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") namespace kfr { @@ -127,10 +127,11 @@ template <cpu_t c, typename T> constexpr inline size_t next_simd_width(size_t n) { #ifdef CMT_ARCH_X86 - return n > vector_width<T, cpu_t::sse2> ? vector_width<T, c> : vector_width<T, cpu_t::sse2>; + return n > platform<T, cpu_t::sse2>::vector_width ? platform<T, c>::vector_width + : platform<T, cpu_t::sse2>::vector_width; #endif #ifdef CMT_ARCH_ARM - return vector_width<T, cpu_t::neon>; + return platform<T, cpu_t::neon>::vector_width; #endif } @@ -147,116 +148,112 @@ KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) } #define KFR_HANDLE_ALL_SIZES_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_FLT_1(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a)))); \ } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a)))); \ } #define KFR_HANDLE_ALL_SIZES_F_1(fn) \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_f_class<T>::value)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_f_class<T>::value)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_f_class<T>::value), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_f_class<T>::value), \ + typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_I_1(fn) \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_i_class<T>::value)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_i_class<T>::value)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_i_class<T>::value), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_i_class<T>::value), \ + typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_U_1(fn) \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_u_class<T>::value)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_u_class<T>::value)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_u_class<T>::value), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_u_class<T>::value), \ + typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn) \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && !is_f_class<T>::value)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && !is_f_class<T>::value)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ - template <typename T, size_t N, \ - KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && !is_f_class<T>::value), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && !is_f_class<T>::value), \ + typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_2(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ { \ return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \ } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ { \ return concat(fn(low(a), low(b)), fn(high(a), high(b))); \ } #define KFR_HANDLE_ALL_SIZES_3(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ { \ return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \ } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ { \ return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \ } #define KFR_HANDLE_ALL_SIZES_4(fn) \ - template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ + template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ { \ return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \ } \ - template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ + template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ { \ return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \ @@ -277,4 +274,4 @@ inline T to_scalar(const vec<T, 1>& value) } } } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp @@ -27,9 +27,9 @@ #include "function.hpp" #include "log_exp.hpp" -#pragma clang diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wc99-extensions") -#pragma clang diagnostic ignored "-Wc99-extensions" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") #endif namespace kfr @@ -92,4 +92,4 @@ KFR_INTRIN internal::expression_function<fn::factorial_approx, E1> factorial_app } } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp @@ -58,7 +58,7 @@ protected: template <size_t N> void call_shift(csize_t<N>) const { - ptr_cast<Class>(this)->shift(csize<N>); + ptr_cast<Class>(this)->shift(csize_t<N>()); } template <size_t N> @@ -81,7 +81,7 @@ protected: CMT_INLINE vec<T, N> generate(vec_t<T, N>) const { const vec<T, N> result = narrow<N>(value); - shift(csize<N>); + shift(csize_t<N>()); return result; } @@ -96,7 +96,7 @@ protected: mutable vec<T, width> value; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2)> +template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2)> struct generator_linear : generator<T, width, generator_linear<T, width>> { constexpr generator_linear(T start, T step) noexcept : step(step), vstep(step* width) @@ -113,7 +113,7 @@ protected: T vstep; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_exp : generator<T, width, generator_exp<T, width>> { generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step* width))[0] - 1) @@ -130,7 +130,7 @@ protected: T vstep; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_exp2 : generator<T, width, generator_exp2<T, width>> { generator_exp2(T start, T step) noexcept : step(step), vstep(exp2(make_vector(step* width))[0] - 1) @@ -147,7 +147,7 @@ protected: T vstep; }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(1, 2), KFR_ARCH_DEP> +template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP> struct generator_cossin : generator<T, width, generator_cossin<T, width>> { generator_cossin(T start, T step) @@ -172,7 +172,7 @@ protected: } }; -template <typename T, size_t width = get_vector_width<T, cpu_t::native>(2, 4), KFR_ARCH_DEP> +template <typename T, size_t width = platform<T>::vector_width* bitness_const(2, 4), KFR_ARCH_DEP> struct generator_sin : generator<T, width, generator_sin<T, width>> { generator_sin(T start, T step) diff --git a/include/kfr/base/kfr.h b/include/kfr/base/kfr.h @@ -14,6 +14,14 @@ #define KFR_VERSION_BUILD 0 #define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_BUILD) +#ifdef CMT_ARCH_X64 +#define KFR_VERSION_FULL \ + "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit (" CMT_COMPIER_NAME ")" +#else +#define KFR_VERSION_FULL \ + "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit (" CMT_COMPIER_NAME ")" +#endif + #ifdef __cplusplus namespace kfr { @@ -22,14 +30,9 @@ constexpr int version_major = KFR_VERSION_MAJOR; constexpr int version_minor = KFR_VERSION_MINOR; constexpr int version_build = KFR_VERSION_BUILD; constexpr int version = KFR_VERSION; +constexpr const char version_full[] = KFR_VERSION_FULL; } #endif -#ifdef CMT_ARCH_X64 -#define KFR_VERSION_FULL "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit" -#else -#define KFR_VERSION_FULL "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit" -#endif - #define KFR_INTRIN CMT_INTRIN #define KFR_SINTRIN CMT_INTRIN static diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp @@ -96,7 +96,7 @@ KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d) vec<f32, N> x = (m - 1.0f) / (m + 1.0f); vec<f32, N> x2 = x * x; - vec<f32, N> sp = select(d < 0, c_qnan<f32>, c_neginfinity<f32>); + vec<f32, N> sp = select(d < 0, constants<f32>::qnan, constants<f32>::neginfinity); vec<f32, N> t = 0.2371599674224853515625f; t = fmadd(t, x2, 0.285279005765914916992188f); @@ -119,7 +119,7 @@ KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d) vec<f64, N> x = (m - 1.0) / (m + 1.0); vec<f64, N> x2 = x * x; - vec<f64, N> sp = select(d < 0, c_qnan<f64>, c_neginfinity<f64>); + vec<f64, N> sp = select(d < 0, constants<f64>::qnan, constants<f64>::neginfinity); vec<f64, N> t = 0.148197055177935105296783; t = fmadd(t, x2, 0.153108178020442575739679); @@ -130,7 +130,7 @@ KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d) t = fmadd(t, x2, 0.666666666666685503450651); t = fmadd(t, x2, 2); - x = x * t + c_log_2<f64> * cast<f64>(e); + x = x * t + constants<f64>::log_2 * cast<f64>(e); x = select(d > 0, x, sp); return x; @@ -139,12 +139,12 @@ KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d) template <typename T, size_t N, typename Tout = flt_type<T>> KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x) { - return log(cast<Tout>(x)) * c_recip_log_2<Tout>; + return log(cast<Tout>(x)) * constants<Tout>::recip_log_2; } template <typename T, size_t N, typename Tout = flt_type<T>> KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x) { - return log(cast<Tout>(x)) * c_recip_log_10<Tout>; + return log(cast<Tout>(x)) * constants<Tout>::recip_log_10; } template <size_t N> @@ -153,7 +153,7 @@ KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d) const f32 ln2_part1 = 0.6931457519f; const f32 ln2_part2 = 1.4286067653e-6f; - vec<i32, N> q = cast<i32>(floor(d * c_recip_log_2<f32>)); + vec<i32, N> q = cast<i32>(floor(d * constants<f32>::recip_log_2)); vec<f32, N> s, u; s = fmadd(cast<f32>(q), -ln2_part1, d); @@ -176,7 +176,7 @@ KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d) u = s * s * u + s + 1.0f; u = vldexpk(u, q); - u = select(d == c_neginfinity<f32>, 0.f, u); + u = select(d == constants<f32>::neginfinity, 0.f, u); return u; } @@ -187,7 +187,7 @@ KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d) const f64 ln2_part1 = 0.69314717501401901245; const f64 ln2_part2 = 5.545926273775592108e-009; - vec<i64, N> q = cast<i64>(floor(d * c_recip_log_2<f64>)); + vec<i64, N> q = cast<i64>(floor(d * +constants<f64>::recip_log_2)); vec<f64, N> s, u; s = fmadd(cast<f64>(q), -ln2_part1, d); @@ -218,19 +218,19 @@ KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d) u = s * s * u + s + 1.0; u = vldexpk(u, q); - u = select(d == c_neginfinity<f64>, 0.0, u); + u = select(d == constants<f64>::neginfinity, 0.0, u); return u; } template <typename T, size_t N, typename Tout = flt_type<T>> KFR_SINTRIN vec<Tout, N> exp2(const vec<T, N>& x) { - return exp(x * c_log_2<Tout>); + return exp(x * constants<Tout>::log_2); } template <typename T, size_t N, typename Tout = flt_type<T>> KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x) { - return exp(x * c_log_10<Tout>); + return exp(x * constants<Tout>::log_10); } template <typename T, size_t N> @@ -239,8 +239,8 @@ KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b) const vec<T, N> t = exp(b * log(abs(a))); const mask<T, N> isint = floor(b) == b; const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0; - return select(a > T(), t, - select(a == T(), T(1), select(isint, select(iseven, t, -t), broadcast<N>(c_qnan<T>)))); + return select(a > T(), t, select(a == T(), T(1), + select(isint, select(iseven, t, -t), broadcast<N>(constants<T>::qnan)))); } template <typename T, size_t N> diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -139,23 +139,23 @@ KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); } KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); } #endif -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN bool bittestall(const vec<T, N>& a) { return bittestall(expand_simd(a, internal::maskbits<T>(true))); } -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN bool bittestall(const vec<T, N>& a) { return bittestall(low(a)) && bittestall(high(a)); } -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN bool bittestany(const vec<T, N>& a) { return bittestany(expand_simd(a, internal::maskbits<T>(false))); } -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN bool bittestany(const vec<T, N>& a) { return bittestany(low(a)) || bittestany(high(a)); @@ -192,23 +192,23 @@ KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); } KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); } -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN bool bittestall(const vec<T, N>& a) { return bittestall(expand_simd(a, internal::maskbits<T>(true))); } -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN bool bittestall(const vec<T, N>& a) { return bittestall(low(a)) && bittestall(high(a)); } -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN bool bittestany(const vec<T, N>& a) { return bittestany(expand_simd(a, internal::maskbits<T>(false))); } -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN bool bittestany(const vec<T, N>& a) { return bittestany(low(a)) || bittestany(high(a)); diff --git a/include/kfr/base/memory.hpp b/include/kfr/base/memory.hpp @@ -57,7 +57,11 @@ struct mem_header u8 reserved1; u8 reserved2; size_t size; -} __attribute__((__packed__)); +} +#ifdef CMT_GNU_ATTRIBUTES +__attribute__((__packed__)) +#endif +; inline mem_header* aligned_header(void* ptr) { return ptr_cast<mem_header>(ptr) - 1; } @@ -85,11 +89,12 @@ inline void aligned_free(void* ptr) } } -template <typename T = void, size_t alignment = native_cache_alignment> +template <typename T = void, size_t alignment = platform<>::native_cache_alignment> CMT_INLINE T* aligned_allocate(size_t size = 1) { T* ptr = static_cast<T*>(CMT_ASSUME_ALIGNED( - internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>), alignment), alignment)); + internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment), + alignment)); return ptr; } diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp @@ -27,9 +27,9 @@ #include "function.hpp" #include "log_exp.hpp" -#pragma clang diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wc99-extensions") -#pragma clang diagnostic ignored "-Wc99-extensions" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") #endif namespace kfr @@ -38,50 +38,49 @@ namespace kfr namespace intrinsics { -template <typename T> -constexpr T bessel_coef[] = { T(0.25), - T(0.027777777777777776236), - T(0.0017361111111111110147), - T(6.9444444444444444384e-005), - T(1.9290123456790123911e-006), - T(3.9367598891408417495e-008), - T(6.1511873267825652335e-010), - T(7.5940584281266239246e-012), - T(7.5940584281266233693e-014), - T(6.2760813455591932909e-016), - T(4.3583898233049949985e-018), - T(2.5789288895295827557e-020), - T(1.3157800456783586208e-022), - T(5.8479113141260384983e-025), - T(2.2843403570804837884e-027), - T(7.904291893012054025e-030), - T(2.4395962632753252792e-032), - T(6.75788438580422547e-035), - T(1.689471096451056426e-037), - T(3.8310002187098784929e-040), - T(7.9152897080782616517e-043), - T(1.4962740468957016443e-045), - T(2.5976979980828152196e-048), - T(4.1563167969325041577e-051), - T(6.1483976285983795968e-054), - T(8.434015951438105991e-057), - T(1.0757673407446563809e-059), - T(1.2791526049282476926e-062), - T(1.4212806721424974034e-065), - T(1.4789601166935457918e-068), - T(1.4442969889585408123e-071), - T(1.3262598613026086927e-074), - T(1.1472836170437790782e-077), - T(9.3655805472961564331e-081), - T(7.2265282000741942594e-084), - T(5.2786911614858977913e-087), - T(3.6556032974279072401e-090), - T(2.4034209713529963119e-093), - T(1.5021381070956226783e-096) }; - template <typename T, size_t N> CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) { + constexpr static T bessel_coef[] = { T(0.25), + T(0.027777777777777776236), + T(0.0017361111111111110147), + T(6.9444444444444444384e-005), + T(1.9290123456790123911e-006), + T(3.9367598891408417495e-008), + T(6.1511873267825652335e-010), + T(7.5940584281266239246e-012), + T(7.5940584281266233693e-014), + T(6.2760813455591932909e-016), + T(4.3583898233049949985e-018), + T(2.5789288895295827557e-020), + T(1.3157800456783586208e-022), + T(5.8479113141260384983e-025), + T(2.2843403570804837884e-027), + T(7.904291893012054025e-030), + T(2.4395962632753252792e-032), + T(6.75788438580422547e-035), + T(1.689471096451056426e-037), + T(3.8310002187098784929e-040), + T(7.9152897080782616517e-043), + T(1.4962740468957016443e-045), + T(2.5976979980828152196e-048), + T(4.1563167969325041577e-051), + T(6.1483976285983795968e-054), + T(8.434015951438105991e-057), + T(1.0757673407446563809e-059), + T(1.2791526049282476926e-062), + T(1.4212806721424974034e-065), + T(1.4789601166935457918e-068), + T(1.4442969889585408123e-071), + T(1.3262598613026086927e-074), + T(1.1472836170437790782e-077), + T(9.3655805472961564331e-081), + T(7.2265282000741942594e-084), + T(5.2786911614858977913e-087), + T(3.6556032974279072401e-090), + T(2.4034209713529963119e-093), + T(1.5021381070956226783e-096) }; + const vec<T, N> x_2 = x * 0.5; const vec<T, N> x_2_sqr = x_2 * x_2; vec<T, N> num = x_2_sqr; @@ -91,7 +90,7 @@ CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) CMT_LOOP_UNROLL for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++) { - result = fmadd((num *= x_2_sqr), bessel_coef<T>[i], result); + result = fmadd((num *= x_2_sqr), bessel_coef[i], result); } return result; } @@ -113,4 +112,4 @@ KFR_INTRIN internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1 } } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp @@ -83,7 +83,7 @@ KFR_FN(add) /** * @brief Returns template expression that returns sum of all the arguments passed to a function. */ -template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)> +template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)> CMT_INLINE internal::expression_function<fn::add, E...> add(E&&... x) { return { fn::add(), std::forward<E>(x)... }; @@ -310,7 +310,7 @@ inline common_type<T1, T2> bitwiseand(const T1& x, const T2& y) template <typename T> constexpr inline T bitwiseand(initialvalue<T>) { - return internal::allones<subtype<T>>; + return constants<T>::allones(); } KFR_FN(bitwiseand) @@ -323,7 +323,7 @@ inline common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y) template <typename T> constexpr inline T bitwiseandnot(initialvalue<T>) { - return internal::allones<subtype<T>>; + return constants<T>::allones(); } KFR_FN(bitwiseandnot) @@ -336,7 +336,7 @@ inline common_type<T1, T2> bitwiseor(const T1& x, const T2& y) template <typename T> constexpr inline T bitwiseor(initialvalue<T>) { - return subtype<T>(); + return subtype<T>(0); } KFR_FN(bitwiseor) @@ -512,14 +512,14 @@ KFR_FN(reciprocal) template <typename T1, typename T2> CMT_INLINE common_type<T1, T2> mulsign(const T1& x, const T2& y) { - return x ^ (y & internal::highbitmask<subtype<T2>>); + return x ^ (y & constants<T2>::highbitmask()); } KFR_FN(mulsign) template <typename T, size_t N> constexpr CMT_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) { - return (x & internal::highbitmask<T>) | (y & internal::highbitmask<T>); + return (x & constants<T>::highbitmask()) | (y & constants<T>::highbitmask()); } template <typename T, size_t N> @@ -531,7 +531,7 @@ CMT_INLINE mask<T, N> isnan(const vec<T, N>& x) template <typename T, size_t N> CMT_INLINE mask<T, N> isinf(const vec<T, N>& x) { - return x == c_infinity<T> || x == -c_infinity<T>; + return x == constants<T>::infinity || x == -constants<T>::infinity; } template <typename T, size_t N> @@ -543,7 +543,7 @@ CMT_INLINE mask<T, N> isfinite(const vec<T, N>& x) template <typename T, size_t N> CMT_INLINE mask<T, N> isnegative(const vec<T, N>& x) { - return (x & internal::highbitmask<T>) != 0; + return (x & constants<T>::highbitmask()) != 0; } template <typename T, size_t N> diff --git a/include/kfr/base/platform.hpp b/include/kfr/base/platform.hpp @@ -93,94 +93,81 @@ CMT_UNUSED static const char* cpu_name(cpu_t set) return "-"; } +#ifdef CMT_ARCH_X64 +template <int = 0> +constexpr inline const char* bitness_const(const char*, const char* x64) +{ + return x64; +} template <typename T> -constexpr inline const T& bitness_const(const T& x32, const T& x64) +constexpr inline const T& bitness_const(const T&, const T& x64) { -#ifdef CMT_ARCH_X64 - (void)x32; return x64; +} #else - (void)x64; +template <int = 0> +constexpr inline const char* bitness_const(const char* x32, const char*) +{ + return x32; +} +template <typename T> +constexpr inline const T& bitness_const(const T& x32, const T&) +{ return x32; -#endif } - -#ifdef CMT_ARCH_X64 -constexpr inline const char* bitness_const(const char*, const char* x64) { return x64; } -#else -constexpr inline const char* bitness_const(const char* x32, const char*) { return x32; } #endif -constexpr size_t native_cache_alignment = 64; -constexpr size_t native_cache_alignment_mask = native_cache_alignment - 1; -constexpr size_t maximum_vector_alignment = 32; -constexpr size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; -constexpr size_t native_register_count = bitness_const(8, 16); +template <typename T = i32, cpu_t c = cpu_t::native> +struct platform +{ + constexpr static size_t native_cache_alignment = 64; + constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1; + constexpr static size_t maximum_vector_alignment = 32; + constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1; +#ifdef CMT_ARCH_X86 + constexpr static size_t simd_register_count = bitness_const(8, 16); +#endif +#ifdef CMT_ARCH_ARM + constexpr static size_t simd_register_count = 16; +#endif -constexpr size_t common_float_vector_size = 16; -constexpr size_t common_int_vector_size = 16; + constexpr static size_t common_float_vector_size = 16; + constexpr static size_t common_int_vector_size = 16; -template <cpu_t c> -constexpr size_t native_float_vector_size = #ifdef CMT_ARCH_X86 - c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size; + constexpr static size_t native_float_vector_size = + c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size; #endif #ifdef CMT_ARCH_ARM -c == cpu_t::neon ? 16 : common_float_vector_size; + constexpr static size_t native_float_vector_size = c == cpu_t::neon ? 16 : common_float_vector_size; #endif -template <cpu_t c> -constexpr size_t native_int_vector_size = #ifdef CMT_ARCH_X86 - c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size; + constexpr static size_t native_int_vector_size = + c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size; #endif #ifdef CMT_ARCH_ARM -c == cpu_t::neon ? 16 : common_int_vector_size; + constexpr static size_t native_int_vector_size = c == cpu_t::neon ? 16 : common_int_vector_size; #endif -/// @brief SIMD vector width for the given cpu instruction set -template <typename T, cpu_t c = cpu_t::native> -constexpr size_t vector_width = const_max(size_t(1), typeclass<T> == datatype::f - ? native_float_vector_size<c> / sizeof(T) - : native_int_vector_size<c> / sizeof(T)); + /// @brief SIMD vector width for the given cpu instruction set + constexpr static size_t vector_width = + (const_max(size_t(1), typeclass<T> == datatype::f ? native_float_vector_size / sizeof(T) + : native_int_vector_size / sizeof(T))); -template <cpu_t c> -constexpr size_t vector_width<void, c> = 0; + constexpr static size_t vector_capacity = simd_register_count * vector_width; -namespace internal -{ + constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity / 4); -template <cpu_t c> -constexpr size_t native_vector_alignment = const_max(native_float_vector_size<c>, native_int_vector_size<c>); + constexpr static size_t native_vector_alignment = + const_max(native_float_vector_size, native_int_vector_size); -template <cpu_t c> -constexpr bool fast_unaligned = + constexpr static bool fast_unaligned = #ifdef CMT_ARCH_X86 - c >= cpu_t::avx1; + c >= cpu_t::avx1; #else - false; + false; #endif -template <cpu_t c> -constexpr size_t native_vector_alignment_mask = native_vector_alignment<c> - 1; - -template <typename T, cpu_t c> -constexpr inline size_t get_vector_width(size_t scale = 1) -{ - return scale * vector_width<T, c>; -} -template <typename T, cpu_t c> -constexpr inline size_t get_vector_width(size_t x32scale, size_t x64scale) -{ - return bitness_const(x32scale, x64scale) * vector_width<T, c>; -} - -template <typename T, cpu_t c> -constexpr auto vector_width_range = csize<1> << csizeseq<ilog2(vector_width<T, c>) + 1>; - -template <typename T, cpu_t c> -constexpr size_t vector_capacity = native_register_count* vector_width<T, c>; - -template <typename T, cpu_t c> -constexpr size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity<T, c> / 4); -} + constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1; +}; } diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp @@ -34,8 +34,10 @@ namespace kfr constexpr size_t maximum_expression_width = bitness_const(16, 32); +constexpr size_t expression_vtable_size = 2 + ilog2(maximum_expression_width) + 1; + template <typename T> -using expression_vtable = std::array<void*, 2 + ilog2(maximum_expression_width) + 1>; +using expression_vtable = std::array<void*, expression_vtable_size>; struct dummy_content { @@ -172,12 +174,11 @@ template <typename T, typename E> expression_vtable<T> make_expression_vtable_impl() { expression_vtable<T> result; - constexpr size_t size = result.size() - 2; result[0] = reinterpret_cast<void*>(internal::make_expression_begin_block<decay<E>>()); result[1] = reinterpret_cast<void*>(internal::make_expression_end_block<decay<E>>()); - cforeach(csizeseq<size>, [&](auto u) { + cforeach(csizeseq_t<expression_vtable_size - 2>(), [&](auto u) { constexpr size_t N = 1 << val_of(decltype(u)()); result[2 + val_of(decltype(u)())] = reinterpret_cast<void*>(internal::make_expression_func<T, N, decay<E>>()); diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp @@ -40,27 +40,33 @@ struct seed_from_rdtsc_t constexpr seed_from_rdtsc_t seed_from_rdtsc{}; +#ifdef CMT_COMPILER_CLANG +#define KFR_builtin_readcyclecounter() __builtin_readcyclecounter() +#else +#define KFR_builtin_readcyclecounter() __rdtsc() +#endif + struct random_bit_generator { random_bit_generator(seed_from_rdtsc_t) noexcept - : state(bitcast<u32>(make_vector(__builtin_readcyclecounter(), - (__builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull))) + : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(), + (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull))) { (void)operator()(); } - constexpr random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3) + random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3) { (void)operator()(); } - constexpr random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1))) + random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1))) { (void)operator()(); } inline random_state operator()() { - constexpr static random_state mul{ 214013u, 17405u, 214013u, 69069u }; - constexpr static random_state add{ 2531011u, 10395331u, 13737667u, 1u }; + CMT_GNU_CONSTEXPR static random_state mul{ 214013u, 17405u, 214013u, 69069u }; + CMT_GNU_CONSTEXPR static random_state add{ 2531011u, 10395331u, 13737667u, 1u }; state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add)))); return state; } diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp @@ -85,19 +85,19 @@ CMT_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<I template <typename T, size_t N> CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices) { - return internal::gather(base, indices, csizeseq<N>); + return internal::gather(base, indices, csizeseq_t<N>()); } template <size_t Nout, typename T> CMT_INLINE vec<T, Nout> gather_stride(const T* base, size_t stride) { - return internal::gather_stride_s<Nout>(base, stride, csizeseq<Nout>); + return internal::gather_stride_s<Nout>(base, stride, csizeseq_t<Nout>()); } template <size_t Nout, size_t Stride, typename T> CMT_INLINE vec<T, Nout> gather_stride(const T* base) { - return internal::gather_stride<Nout, Stride>(base, csizeseq<Nout>); + return internal::gather_stride<Nout, Stride>(base, csizeseq_t<Nout>()); } template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices> @@ -108,7 +108,7 @@ CMT_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& template <size_t groupsize = 1, typename T, size_t N, typename IT> CMT_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) { - return gather_helper<groupsize>(base, offset, csizeseq<N>); + return gather_helper<groupsize>(base, offset, csizeseq_t<N>()); } template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> @@ -121,42 +121,42 @@ CMT_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, N template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> CMT_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) { - return scatter_helper<groupsize>(base, offset, value, csizeseq<N>); + return scatter_helper<groupsize>(base, offset, value, csizeseq_t<N>()); } template <typename T> -constexpr T partial_masks[] = { internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, - internal::allones<T>, +constexpr T partial_masks[] = { constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), + constants<T>::allones(), T(), T(), T(), diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp @@ -61,10 +61,10 @@ CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value) return finalfn(value); } -template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, cpu_t cpu = cpu_t::native> +template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, KFR_ARCH_DEP> struct expression_reduce : output_expression { - constexpr static size_t width = vector_width<T, cpu> * bitness_const(1, 2); + constexpr static size_t width = platform<T>::vector_width * bitness_const(1, 2); using value_type = T; @@ -85,16 +85,16 @@ struct expression_reduce : output_expression protected: void reset() { counter = 0; } - CMT_INLINE void process(vec<T, width> x) const { value = reducefn(transformfn(x), value); } + CMT_INLINE void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); } template <size_t N, KFR_ENABLE_IF(N < width)> - CMT_INLINE void process(vec<T, N> x) const + CMT_INLINE void process(const vec<T, N>& x) const { value = combine(value, reducefn(transformfn(x), narrow<N>(value))); } template <size_t N, KFR_ENABLE_IF(N > width)> - CMT_INLINE void process(vec<T, N> x) const + CMT_INLINE void process(const vec<T, N>& x) const { process(low(x)); process(high(x)); @@ -110,14 +110,14 @@ protected: template <typename ReduceFn, typename TransformFn = fn::pass_through, typename FinalFn = fn::pass_through, typename E1, typename T = value_type_of<E1>> -KFR_SINTRIN T reduce(E1&& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(), +KFR_SINTRIN T reduce(const E1& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(), FinalFn&& finalfn = fn::pass_through()) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); using reducer_t = internal::expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>; reducer_t red(std::forward<ReduceFn>(reducefn), std::forward<TransformFn>(transformfn), std::forward<FinalFn>(finalfn)); - process(red, std::forward<E1>(e1)); + process(red, e1); return red.get(); } @@ -133,10 +133,10 @@ KFR_FN(reduce) * \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T sum(E1&& x) +KFR_SINTRIN T sum(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::add()); + return reduce(x, fn::add()); } /** @@ -148,10 +148,10 @@ KFR_SINTRIN T sum(E1&& x) * \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T mean(E1&& x) +KFR_SINTRIN T mean(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::add(), fn::pass_through(), fn::final_mean()); + return reduce(x, fn::add(), fn::pass_through(), fn::final_mean()); } /** @@ -160,10 +160,10 @@ KFR_SINTRIN T mean(E1&& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T minof(E1&& x) +KFR_SINTRIN T minof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::min()); + return reduce(x, fn::min()); } /** @@ -172,10 +172,10 @@ KFR_SINTRIN T minof(E1&& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T maxof(E1&& x) +KFR_SINTRIN T maxof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::max()); + return reduce(x, fn::max()); } /** @@ -184,10 +184,10 @@ KFR_SINTRIN T maxof(E1&& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T absminof(E1&& x) +KFR_SINTRIN T absminof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::absmin()); + return reduce(x, fn::absmin()); } /** @@ -196,10 +196,10 @@ KFR_SINTRIN T absminof(E1&& x) * x must have its size and type specified. */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T absmaxof(E1&& x) +KFR_SINTRIN T absmaxof(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::absmax()); + return reduce(x, fn::absmax()); } /** @@ -229,10 +229,10 @@ KFR_SINTRIN T dotproduct(E1&& x, E2&& y) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T rms(E1&& x) +KFR_SINTRIN T rms(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::add(), fn::sqr(), fn::final_rootmean()); + return reduce(x, fn::add(), fn::sqr(), fn::final_rootmean()); } /** @@ -244,10 +244,10 @@ KFR_SINTRIN T rms(E1&& x) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T sumsqr(E1&& x) +KFR_SINTRIN T sumsqr(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::add(), fn::sqr()); + return reduce(x, fn::add(), fn::sqr()); } /** @@ -259,9 +259,9 @@ KFR_SINTRIN T sumsqr(E1&& x) \f] */ template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)> -KFR_SINTRIN T product(E1&& x) +KFR_SINTRIN T product(const E1& x) { static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())"); - return reduce(std::forward<E1>(x), fn::mul()); + return reduce(x, fn::mul()); } } diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -121,12 +121,12 @@ KFR_SINTRIN i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y) } #endif -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) { return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c))); } -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) { return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c))); @@ -178,12 +178,12 @@ KFR_SINTRIN f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y } #endif -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) { return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c))); } -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) { return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c))); diff --git a/include/kfr/base/shuffle.hpp b/include/kfr/base/shuffle.hpp @@ -40,7 +40,7 @@ namespace internal template <size_t index, typename T> constexpr CMT_INLINE T broadcast_get_nth() { - return c_qnan<T>; + return constants<T>::qnan; } template <size_t index, typename T, typename... Ts> @@ -51,17 +51,17 @@ constexpr CMT_INLINE T broadcast_get_nth(T x, Ts... rest) template <typename T, typename... Ts, size_t... indices, size_t Nin = 1 + sizeof...(Ts), size_t Nout = sizeof...(indices)> -CMT_INLINE constexpr vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts... rest) +CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast_helper(csizes_t<indices...>, T x, Ts... rest) { - simd<T, Nout> result{ broadcast_get_nth<indices % Nin>(x, rest...)... }; - return result; + using simd_t = simd<T, Nout>; + return KFR_SIMD_SET(simd_t, broadcast_get_nth<indices % Nin>(x, rest...)...); } } template <size_t Nout, typename T, typename... Ts> -constexpr CMT_INLINE vec<T, Nout> broadcast(T x, T y, Ts... rest) +CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast(T x, T y, Ts... rest) { - return internal::broadcast_helper(csizeseq<Nout>, x, y, rest...); + return internal::broadcast_helper(csizeseq_t<Nout>(), x, y, rest...); } KFR_FN(broadcast) @@ -215,10 +215,9 @@ struct shuffle_index_shuffle constexpr static size_t indexcount = sizeof...(Indices); template <size_t index> - constexpr inline size_t operator()() const + constexpr inline size_t operator()(csize_t<index>) const { - constexpr int result = csizes_t<Indices...>::get(csize<index % indexcount>); - return result + index / indexcount * indexcount; + return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount; } }; } @@ -247,11 +246,9 @@ struct shuffle_index_permute constexpr static size_t indexcount = sizeof...(Indices); template <size_t index> - constexpr inline size_t operator()() const + constexpr inline size_t operator()(csize_t<index>) const { - constexpr size_t result = csizes_t<Indices...>::get(csize<index % indexcount>); - static_assert(result < size, "result < size"); - return result + index / indexcount * indexcount; + return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount; } }; } @@ -341,9 +338,9 @@ struct shuffle_index_blend constexpr static size_t indexcount = sizeof...(Indices); template <size_t index> - constexpr inline size_t operator()() const + constexpr inline size_t operator()(csize_t<index>) const { - return (elements_t<Indices...>::get(csize<index % indexcount>) ? size : 0) + index % size; + return (elements_t<Indices...>::get(csize_t<index % indexcount>()) ? size : 0) + index % size; } }; } @@ -434,8 +431,7 @@ struct shuffle_index_transpose { constexpr inline size_t operator()(size_t index) const { - constexpr size_t side2 = size / side1; - return index % side2 * side1 + index / side2; + return index % (size / side1) * side1 + index / (size / side1); } }; } diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp @@ -28,6 +28,9 @@ #include "kfr.h" #include "types.hpp" +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4324)) + namespace kfr { @@ -72,10 +75,11 @@ CMT_INLINE simd<T, N> simd_read(const T* src) template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> CMT_INLINE simd<T, N> simd_read(const T* src) { - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>); - constexpr auto concat_indices = csizeseq<N>; + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + constexpr auto extend_indices = + cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); + constexpr auto concat_indices = cvalseq_t<size_t, N>(); return simd_shuffle<T, first>(simd_read<first, A>(src), simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices), concat_indices); @@ -92,10 +96,11 @@ CMT_INLINE void simd_write(T* dest, const simd<T, N>& value) { constexpr size_t first = prev_poweroftwo(N); constexpr size_t rest = N - first; - simd_write<A, first>(dest, simd_shuffle(value, csizeseq<first>)); - simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq<rest, first>)); + simd_write<A, first>(dest, simd_shuffle(value, csizeseq_t<first>())); + simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq_t<rest, first>())); } +#define KFR_SIMD_SET(T, ...) (T{ __VA_ARGS__ }) #define KFR_SIMD_CAST(T, N, X) __builtin_convertvector(X, ::kfr::simd<T, N>) #define KFR_SIMD_BITCAST(T, N, X) ((::kfr::simd<T, N>)(X)) #define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X)) @@ -107,6 +112,9 @@ namespace internal { template <typename T> +constexpr inline T maskbits(bool value); + +template <typename T> struct simd_float_ops { constexpr static T neg(T x) { return -x; } @@ -144,74 +152,74 @@ struct simd_int_ops : simd_float_ops<T> } template <typename T, size_t N> -struct alignas(next_poweroftwo(N * sizeof(T))) simd +struct alignas(const_min(size_t(64), next_poweroftwo(N * sizeof(T)))) simd { using ops = conditional<std::is_floating_point<T>::value, internal::simd_float_ops<T>, internal::simd_int_ops<T>>; - constexpr static simd broadcast(T value) { return broadcast_impl(value, csizeseq<N>); } + constexpr static simd broadcast(T value) { return broadcast_impl(value, cvalseq_t<size_t, N>()); } constexpr friend simd operator+(const simd& x) { return x; } - constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, csizeseq<N>); } - constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, csizeseq<N>); } + constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, cvalseq_t<size_t, N>()); } + constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, cvalseq_t<size_t, N>()); } constexpr friend simd operator+(const simd& x, const simd& y) { - return op_impl<ops::add>(x, y, csizeseq<N>); + return op_impl<ops::add>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator-(const simd& x, const simd& y) { - return op_impl<ops::sub>(x, y, csizeseq<N>); + return op_impl<ops::sub>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator*(const simd& x, const simd& y) { - return op_impl<ops::mul>(x, y, csizeseq<N>); + return op_impl<ops::mul>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator/(const simd& x, const simd& y) { - return op_impl<ops::div>(x, y, csizeseq<N>); + return op_impl<ops::div>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator&(const simd& x, const simd& y) { - return op_impl<ops::band>(x, y, csizeseq<N>); + return op_impl<ops::band>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator|(const simd& x, const simd& y) { - return op_impl<ops::bor>(x, y, csizeseq<N>); + return op_impl<ops::bor>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator^(const simd& x, const simd& y) { - return op_impl<ops::bxor>(x, y, csizeseq<N>); + return op_impl<ops::bxor>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator<<(const simd& x, const simd& y) { - return op_impl<ops::shl>(x, y, csizeseq<N>); + return op_impl<ops::shl>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator>>(const simd& x, const simd& y) { - return op_impl<ops::shr>(x, y, csizeseq<N>); + return op_impl<ops::shr>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator==(const simd& x, const simd& y) { - return op_impl<ops::eq>(x, y, csizeseq<N>); + return op_impl<ops::eq>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator!=(const simd& x, const simd& y) { - return op_impl<ops::ne>(x, y, csizeseq<N>); + return op_impl<ops::ne>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator<(const simd& x, const simd& y) { - return op_impl<ops::lt>(x, y, csizeseq<N>); + return op_impl<ops::lt>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator>(const simd& x, const simd& y) { - return op_impl<ops::gt>(x, y, csizeseq<N>); + return op_impl<ops::gt>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator<=(const simd& x, const simd& y) { - return op_impl<ops::le>(x, y, csizeseq<N>); + return op_impl<ops::le>(x, y, cvalseq_t<size_t, N>()); } constexpr friend simd operator>=(const simd& x, const simd& y) { - return op_impl<ops::ge>(x, y, csizeseq<N>); + return op_impl<ops::ge>(x, y, cvalseq_t<size_t, N>()); } constexpr T operator[](size_t index) const { return items[index]; } T& operator[](size_t index) { return items[index]; } @@ -220,29 +228,29 @@ struct alignas(next_poweroftwo(N * sizeof(T))) simd template <typename U> constexpr simd<U, N> cast() const { - return cast_impl<U>(*this, csizeseq<N>); + return cast_impl<U>(*this, cvalseq_t<size_t, N>()); } private: template <typename U, size_t... indices> constexpr static simd<U, N> cast_impl(const simd& x, csizes_t<indices...>) { - return simd<U, N>{ static_cast<U>(x.items[indices])... }; + return simd<U, N>{ { static_cast<U>(x.items[indices])... } }; } template <T (*fn)(T), size_t... indices> constexpr static simd op_impl(const simd& x, csizes_t<indices...>) { - return simd{ fn(x.items[indices])... }; + return simd{ { fn(x.items[indices])... } }; } template <T (*fn)(T, T), size_t... indices> constexpr static simd op_impl(const simd& x, const simd& y, csizes_t<indices...>) { - return simd{ fn(x.items[indices], y.items[indices])... }; + return simd{ { fn(x.items[indices], y.items[indices])... } }; } template <size_t... indices> constexpr static simd broadcast_impl(T value, csizes_t<indices...>) { - return simd{ ((void)indices, value)... }; + return simd{ { ((void)indices, value)... } }; } }; @@ -256,12 +264,12 @@ template <typename T, size_t N, int... indices> constexpr CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const simd<T, N>& x, const simd<T, N>& y, cints_t<indices...>) noexcept { - return simd<T, sizeof...(indices)>{ (indices == -1 ? T() - : ((indices >= N) ? y[indices - N] : x[indices]))... }; + return simd<T, sizeof...(indices)>{ { ( + indices == -1 ? T() : ((indices >= N) ? y[indices - N] : x[indices]))... } }; } template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)> -constexpr CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept +CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept { union { const simd<From, N> from; @@ -297,19 +305,22 @@ CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, ctrue_t) template <size_t N, bool A = false, typename T> CMT_INLINE simd<T, N> simd_read(const T* src) { - return simd_read_impl<N>(src, cbool<A>); + return simd_read_impl<N>(src, cbool_t<A>()); } template <bool A = false, size_t N, typename T> CMT_INLINE void simd_write(T* dest, const simd<T, N>& value) { - return simd_write_impl<N>(dest, value, cbool<A>); + return simd_write_impl<N>(dest, value, cbool_t<A>()); } -#define KFR_SIMD_CAST(T, N, X) (::kfr::simd_cast<T>(X)) -#define KFR_SIMD_BITCAST(T, N, X) (::kfr::simd_bitcast<T>(X)) +#define KFR_SIMD_SET(T, ...) (T{ { __VA_ARGS__ } }) +#define KFR_SIMD_CAST(T, N, X) ((void)N, ::kfr::simd_cast<T>(X)) +#define KFR_SIMD_BITCAST(T, N, X) ((void)N, ::kfr::simd_bitcast<T>(X)) #define KFR_SIMD_BROADCAST(T, N, X) (::kfr::simd<T, N>::broadcast(X)) -#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints<__VA_ARGS__>) +#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints_t<__VA_ARGS__>()) #endif } + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp @@ -35,7 +35,7 @@ #include "shuffle.hpp" #if CMT_HAS_WARNING("-Wc99-extensions") -#pragma clang diagnostic ignored "-Wc99-extensions" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions") #endif namespace kfr @@ -44,16 +44,6 @@ namespace kfr namespace intrinsics { -template <typename T> -constexpr static T fold_constant_div = choose_const<T>(0x1.921fb6p-1f, 0x1.921fb54442d18p-1); - -template <typename T> -constexpr static T fold_constant_hi = choose_const<T>(0x1.922000p-1f, 0x1.921fb40000000p-1); -template <typename T> -constexpr static T fold_constant_rem1 = choose_const<T>(-0x1.2ae000p-19f, 0x1.4442d00000000p-25); -template <typename T> -constexpr static T fold_constant_rem2 = choose_const<T>(-0x1.de973ep-32f, 0x1.8469898cc5170p-49); - template <typename T, size_t N> KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0) { @@ -70,9 +60,9 @@ KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, con template <typename T, size_t N, typename Tprecise = f64> KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) { - const vec<T, N> xabs = abs(x); - constexpr vec<T, N> div = fold_constant_div<T>; - vec<T, N> y = floor(xabs / div); + const vec<T, N> xabs = abs(x); + constexpr T div = constants<T>::fold_constant_div; + vec<T, N> y = floor(xabs / div); quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0)); const mask<T, N> msk = bitcast<T>((quadrant & 1) != 0); @@ -80,25 +70,25 @@ KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) y = select(msk, y + T(1.0), y); quadrant = quadrant & 7; - constexpr vec<Tprecise, N> hi = cast<Tprecise>(fold_constant_hi<T>); - constexpr vec<T, N> rem1 = fold_constant_rem1<T>; - constexpr vec<T, N> rem2 = fold_constant_rem2<T>; + constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi); + constexpr T rem1 = constants<T>::fold_constant_rem1; + constexpr T rem2 = constants<T>::fold_constant_rem2; return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2; } template <size_t N> KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask) { - constexpr f32 sin_c2 = -0x2.aaaaacp-4f; - constexpr f32 sin_c4 = 0x2.222334p-8f; - constexpr f32 sin_c6 = -0xd.0566ep-16f; - constexpr f32 sin_c8 = 0x3.64cc1cp-20f; - constexpr f32 sin_c10 = -0x5.6c4a4p-24f; - constexpr f32 cos_c2 = -0x8.p-4f; - constexpr f32 cos_c4 = 0xa.aaaabp-8f; - constexpr f32 cos_c6 = -0x5.b05d48p-12f; - constexpr f32 cos_c8 = 0x1.a065f8p-16f; - constexpr f32 cos_c10 = -0x4.cd156p-24f; + constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, -1.6666667163e-01f); + constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f); + constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f); + constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f); + constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f); + constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f); + constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f); + constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f); + constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f); + constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f); const vec<f32, N> x2 = folded * folded; @@ -112,22 +102,22 @@ KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N template <size_t N> KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask) { - constexpr f64 sin_c2 = -0x2.aaaaaaaaaaaaap-4; - constexpr f64 sin_c4 = 0x2.22222222220cep-8; - constexpr f64 sin_c6 = -0xd.00d00cffd6618p-16; - constexpr f64 sin_c8 = 0x2.e3bc744fb879ep-20; - constexpr f64 sin_c10 = -0x6.b99034c1467a4p-28; - constexpr f64 sin_c12 = 0xb.0711ea8fe8ee8p-36; - constexpr f64 sin_c14 = -0xb.7e010897e55dp-44; - constexpr f64 sin_c16 = -0xb.64eac07f1d6bp-48; - constexpr f64 cos_c2 = -0x8.p-4; - constexpr f64 cos_c4 = 0xa.aaaaaaaaaaaa8p-8; - constexpr f64 cos_c6 = -0x5.b05b05b05ad28p-12; - constexpr f64 cos_c8 = 0x1.a01a01a0022e6p-16; - constexpr f64 cos_c10 = -0x4.9f93ed845de2cp-24; - constexpr f64 cos_c12 = 0x8.f76bc015abe48p-32; - constexpr f64 cos_c14 = -0xc.9bf2dbe00379p-40; - constexpr f64 cos_c16 = 0xd.1232ac32f7258p-48; + constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01); + constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03); + constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04); + constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06); + constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08); + constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10); + constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13); + constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14); + constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01); + constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02); + constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03); + constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 2.480158730125666056e-05); + constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07); + constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09); + constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11); + constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14); vec<f64, N> x2 = folded * folded; vec<f64, N> formula = @@ -193,7 +183,7 @@ KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x) template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x) { - constexpr vec<T, N> msk = broadcast<N>(internal::highbitmask<T>); + CMT_GNU_CONSTEXPR vec<T, N> msk = broadcast<N>(constants<T>::highbitmask()); constexpr static T c2 = -0.16665853559970855712890625; constexpr static T c4 = +8.31427983939647674560546875e-3; @@ -238,7 +228,7 @@ KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x) template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x) { - return select(abs(x) <= c_epsilon<T>, T(1), sin(x) / x); + return select(abs(x) <= constants<T>::epsilon, T(1), sin(x) / x); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>> @@ -294,37 +284,37 @@ KFR_I_FLT_CONVERTER(sinc) template <typename T, typename Tout = flt_type<T>> KFR_SINTRIN Tout sindeg(const T& x) { - return sin(x * c_degtorad<Tout>); + return sin(x * constants<Tout>::degtorad); } template <typename T, typename Tout = flt_type<T>> KFR_SINTRIN Tout cosdeg(const T& x) { - return cos(x * c_degtorad<Tout>); + return cos(x * constants<Tout>::degtorad); } template <typename T, typename Tout = flt_type<T>> KFR_SINTRIN Tout fastsindeg(const T& x) { - return fastsin(x * c_degtorad<Tout>); + return fastsin(x * constants<Tout>::degtorad); } template <typename T, typename Tout = flt_type<T>> KFR_SINTRIN Tout fastcosdeg(const T& x) { - return fastcos(x * c_degtorad<Tout>); + return fastcos(x * constants<Tout>::degtorad); } template <typename T, typename Tout = flt_type<T>> KFR_SINTRIN Tout sincosdeg(const T& x) { - return sincos(x * c_degtorad<Tout>); + return sincos(x * constants<Tout>::degtorad); } template <typename T, typename Tout = flt_type<T>> KFR_SINTRIN Tout cossindeg(const T& x) { - return cossin(x * c_degtorad<Tout>); + return cossin(x * constants<Tout>::degtorad); } } diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp @@ -45,7 +45,7 @@ CMT_INLINE vec<T, N> sort(const vec<T, N>& x) constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); vec<T, Nhalf> o = high(x); - constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); + constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>()); for (size_t i = 0; i < Nhalf; i++) { vec<T, Nhalf> t; @@ -78,7 +78,7 @@ CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x) constexpr size_t Nhalf = N / 2; vec<T, Nhalf> e = low(x); vec<T, Nhalf> o = high(x); - constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); + constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>()); for (size_t i = 0; i < Nhalf; i++) { vec<T, Nhalf> t; diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp @@ -64,19 +64,19 @@ KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full) mask<f32, N> inverse; const vec<f32, N> x = trig_fold_simple(x_full, inverse); - constexpr f32 tan_c2 = 0x5.555378p-4; - constexpr f32 tan_c4 = 0x2.225bb8p-4; - constexpr f32 tan_c6 = 0xd.ac3fep-8; - constexpr f32 tan_c8 = 0x6.41644p-8; - constexpr f32 tan_c10 = 0xc.bfe7ep-12; - constexpr f32 tan_c12 = 0x2.6754dp-8; - - constexpr f32 cot_c2 = -0x5.555558p-4; - constexpr f32 cot_c4 = -0x5.b0581p-8; - constexpr f32 cot_c6 = -0x8.ac5ccp-12; - constexpr f32 cot_c8 = -0xd.aaa01p-16; - constexpr f32 cot_c10 = -0x1.a9a9b4p-16; - constexpr f32 cot_c12 = -0x6.f7d4dp-24; + constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01); + constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01); + constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02); + constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02); + constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03); + constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03); + + constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01); + constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02); + constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03); + constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04); + constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05); + constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, -4.153305894760705996e-07); const vec<f32, N> x2 = x * x; const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, @@ -92,27 +92,27 @@ KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full) mask<f64, N> inverse; const vec<f64, N> x = trig_fold_simple(x_full, inverse); - constexpr f64 tan_c2 = 0x5.5555554d8e5b8p-4; - constexpr f64 tan_c4 = 0x2.222224820264p-4; - constexpr f64 tan_c6 = 0xd.d0d90de32b3e8p-8; - constexpr f64 tan_c8 = 0x5.99723bdcf5cacp-8; - constexpr f64 tan_c10 = 0x2.434a142e413ap-8; - constexpr f64 tan_c12 = 0xf.2b59061305efp-12; - constexpr f64 tan_c14 = 0x4.a12565071a664p-12; - constexpr f64 tan_c16 = 0x4.dada3797ac1bcp-12; - constexpr f64 tan_c18 = -0x1.a74976b6ea3f3p-12; - constexpr f64 tan_c20 = 0x1.d06a5ae5e4a74p-12; - - constexpr f64 cot_c2 = -0x5.5555555555554p-4; - constexpr f64 cot_c4 = -0x5.b05b05b05b758p-8; - constexpr f64 cot_c6 = -0x8.ab355dffc79a8p-12; - constexpr f64 cot_c8 = -0xd.debbca405c9f8p-16; - constexpr f64 cot_c10 = -0x1.66a8edb99b15p-16; - constexpr f64 cot_c12 = -0x2.450239be0ee92p-20; - constexpr f64 cot_c14 = -0x3.ad6ddb4719438p-24; - constexpr f64 cot_c16 = -0x5.ff4c42741356p-28; - constexpr f64 cot_c18 = -0x9.06881bcdf3108p-32; - constexpr f64 cot_c20 = -0x1.644abedc113cap-32; + constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01); + constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01); + constexpr f64 tan_c6 = CMT_FP(0xd.d0d90de32b3e8p-8, 5.396801556632355862e-02); + constexpr f64 tan_c8 = CMT_FP(0x5.99723bdcf5cacp-8, 2.187265359403693307e-02); + constexpr f64 tan_c10 = CMT_FP(0x2.434a142e413ap-8, 8.839254309582239566e-03); + constexpr f64 tan_c12 = CMT_FP(0xf.2b59061305efp-12, 3.703449009834865711e-03); + constexpr f64 tan_c14 = CMT_FP(0x4.a12565071a664p-12, 1.130243370829653185e-03); + constexpr f64 tan_c16 = CMT_FP(0x4.dada3797ac1bcp-12, 1.185276423238536747e-03); + constexpr f64 tan_c18 = CMT_FP(-0x1.a74976b6ea3f3p-12, -4.036779095551438937e-04); + constexpr f64 tan_c20 = CMT_FP(0x1.d06a5ae5e4a74p-12, 4.429010863244216712e-04); + + constexpr f64 cot_c2 = CMT_FP(-0x5.5555555555554p-4, -3.333333333333333148e-01); + constexpr f64 cot_c4 = CMT_FP(-0x5.b05b05b05b758p-8, -2.222222222222377391e-02); + constexpr f64 cot_c6 = CMT_FP(-0x8.ab355dffc79a8p-12, -2.116402116358796163e-03); + constexpr f64 cot_c8 = CMT_FP(-0xd.debbca405c9f8p-16, -2.116402122295888289e-04); + constexpr f64 cot_c10 = CMT_FP(-0x1.66a8edb99b15p-16, -2.137779458737224013e-05); + constexpr f64 cot_c12 = CMT_FP(-0x2.450239be0ee92p-20, -2.164426049513111728e-06); + constexpr f64 cot_c14 = CMT_FP(-0x3.ad6ddb4719438p-24, -2.191935496317727080e-07); + constexpr f64 cot_c16 = CMT_FP(-0x5.ff4c42741356p-28, -2.234152473099993830e-08); + constexpr f64 cot_c18 = CMT_FP(-0x9.06881bcdf3108p-32, -2.101416316020595077e-09); + constexpr f64 cot_c20 = CMT_FP(-0x1.644abedc113cap-32, -3.240456633529511097e-10); const vec<f64, N> x2 = x * x; const vec<f64, N> val = trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp @@ -30,8 +30,8 @@ #include <cmath> -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") #include "../cometa.hpp" @@ -210,11 +210,14 @@ using returns = cometa::fn_returns<T>; } template <typename T> -using ftype = deep_rebind<T, float_type<typebits<deep_subtype<T>>::bits>>; +using ftype = + typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>; template <typename T> -using itype = deep_rebind<T, int_type<typebits<deep_subtype<T>>::bits>>; +using itype = + typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>; template <typename T> -using utype = deep_rebind<T, unsigned_type<typebits<deep_subtype<T>>::bits>>; +using utype = + typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>; template <typename T> using fsubtype = ftype<subtype<T>>; @@ -301,8 +304,8 @@ CMT_INLINE void zeroize(T1& value) builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1)); } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wattributes" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes") template <typename T, bool A> struct struct_with_alignment @@ -322,7 +325,7 @@ __attribute__((__packed__, __may_alias__)) // #endif ; -#pragma GCC diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) } template <typename T> @@ -330,45 +333,6 @@ struct initialvalue { }; -#if CMT_COMPILER_GNU -constexpr double infinity = __builtin_inf(); -constexpr double qnan = __builtin_nan(""); -#else -constexpr double infinity = HUGE_VAL; -constexpr double qnan = NAN; -#endif - -namespace internal -{ -#if CMT_COMPILER_GNU -constexpr f32 allones_f32 = -__builtin_nanf("0xFFFFFFFF"); -constexpr f64 allones_f64 = -__builtin_nan("0xFFFFFFFFFFFFFFFF"); -#else -constexpr f32 allones_f32 = -NAN; -constexpr f64 allones_f64 = -NAN; -#endif - -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub allones = choose_const<Tsub>(allones_f32, allones_f64, static_cast<Tsub>(-1)); - -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub allzeros = Tsub(); - -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub highbitmask = choose_const<Tsub>(-0.f, -0.0, 1ull << (typebits<T>::bits - 1)); - -template <typename T, typename Tsub = subtype<T>> -constexpr Tsub invhighbitmask = choose_const<Tsub>(__builtin_nanf("0xFFFFFFFF"), - __builtin_nan("0xFFFFFFFFFFFFFFFF"), - ~(1ull << (typebits<T>::bits - 1))); - -template <typename T> -constexpr inline T maskbits(bool value) -{ - return value ? internal::allones<T> : T(); -} -} - namespace internal { template <size_t width, typename Fn> @@ -376,7 +340,7 @@ CMT_INLINE void block_process_impl(size_t& i, size_t size, Fn&& fn) { CMT_LOOP_NOUNROLL for (; i < size / width * width; i += width) - fn(i, csize<width>); + fn(i, csize_t<width>()); } } @@ -388,4 +352,4 @@ CMT_INLINE void block_process(size_t size, csizes_t<widths...>, Fn&& fn) } } -#pragma GCC diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp @@ -32,6 +32,9 @@ #include "read_write.hpp" #include "types.hpp" +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4324)) + namespace kfr { @@ -136,7 +139,7 @@ struct univector_base : input_expression, output_expression { ringbuf_write(cursor, x.data(), N); } - void ringbuf_write(size_t& cursor, const T value) + void ringbuf_write(size_t& cursor, const T& value) { T* data = get_data(); data[cursor] = value; @@ -205,8 +208,8 @@ private: }; template <typename T, size_t Size> -struct alignas(maximum_vector_alignment) univector : std::array<T, Size>, - univector_base<T, univector<T, Size>> +struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, Size>, + univector_base<T, univector<T, Size>> { using std::array<T, Size>::size; using size_type = size_t; @@ -216,7 +219,8 @@ struct alignas(maximum_vector_alignment) univector : std::array<T, Size>, this->assign_expr(std::forward<Input>(input)); } template <typename... Args> - constexpr univector(T x, Args... args) noexcept : std::array<T, Size>{ { x, static_cast<T>(args)... } } + constexpr univector(const T& x, const Args&... args) noexcept + : std::array<T, Size>{ { x, static_cast<T>(args)... } } { } @@ -344,3 +348,5 @@ CMT_INLINE univector<T> render(Expr&& expr, size_t size) return result; } } + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp @@ -27,15 +27,15 @@ #include "kfr.h" -#include "types.hpp" - +#include "constants.hpp" #include "simd.hpp" +#include "types.hpp" -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" -#pragma clang diagnostic ignored "-Wc++98-compat-local-type-template-args" -#pragma clang diagnostic ignored "-Wshadow" -#pragma clang diagnostic ignored "-Wpacked" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked") namespace kfr { @@ -139,6 +139,12 @@ struct mask; namespace internal { +template <typename T> +constexpr inline T maskbits(bool value) +{ + return value ? constants<T>::allones() : T(); +} + template <typename T, size_t N> struct flt_type_impl<vec<T, N>> { @@ -175,38 +181,36 @@ constexpr CMT_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept namespace internal { template <typename Fn, size_t index> -constexpr enable_if<std::is_same<size_t, decltype(std::declval<Fn>().operator()(size_t()))>::value, size_t> -get_vec_index() +constexpr size_t get_vec_index() noexcept { +#ifdef CMT_COMPILER_GNU constexpr Fn fn{}; - return fn(index); + return fn(csize_t<index>()); +#else + return Fn()(csize_t<index>()); +#endif } -template <typename Fn, size_t index> -constexpr enable_if< - std::is_same<size_t, decltype(std::declval<Fn>().template operator() < index > ())>::value, size_t> -get_vec_index(int = 0) +constexpr int vindex(size_t index) { return index == index_undefined ? -1 : static_cast<int>(index); } + +template <typename T, size_t N, size_t... Indices, typename = enable_if<!(is_compound<T>::value)>, + typename = void> +CMT_INLINE vec<T, sizeof...(Indices)> shufflevector_i_t(csizes_t<Indices...>, const vec<T, N>& x, + const vec<T, N>& y) { - constexpr Fn fn{}; - return fn.template operator()<index>(); + vec<T, sizeof...(Indices)> result = KFR_SIMD_SHUFFLE(*x, *y, vindex(Indices)...); + return result; } -template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(!is_compound<T>::value)> -CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...>, const vec<T, N>& x, - const vec<T, N>& y) +constexpr size_t vinflate_index(size_t counter, size_t groupsize, size_t index) { - vec<T, sizeof...(Indices)> result = KFR_SIMD_SHUFFLE( - *x, *y, static_cast<intptr_t>(Indices == index_undefined ? -1 : static_cast<intptr_t>(Indices))...); - return result; + return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index); } template <size_t counter, size_t groupsize, size_t... indices> constexpr size_t inflate_get_index() { - constexpr csizes_t<indices...> ind{}; - return (ind.get(csize<counter / groupsize>) == index_undefined - ? index_undefined - : (counter % groupsize + groupsize * ind.get(csize<counter / groupsize>))); + return vinflate_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>())); } template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)> @@ -215,29 +219,25 @@ constexpr auto inflate_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) { return {}; } -} - -namespace internal -{ template <size_t groupsize, size_t... indices> constexpr auto inflate(csize_t<groupsize>, csizes_t<indices...>) { - return inflate_impl(csizes<indices...>, csizeseq<sizeof...(indices)*groupsize>); + return inflate_impl(csizes_t<indices...>(), csizeseq_t<sizeof...(indices)*groupsize>()); } -template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(is_compound<T>::value)> -CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...> indices, const vec<T, N>& x, - const vec<T, N>& y) +template <typename T, size_t N, size_t... Indices, typename = enable_if<(is_compound<T>::value)>> +CMT_INLINE vec<T, sizeof...(Indices)> shufflevector_i_t(csizes_t<Indices...> indices, const vec<T, N>& x, + const vec<T, N>& y) { - return compcast<T>(shufflevector(inflate(csize<widthof<T>()>, indices), compcast<subtype<T>>(x), - compcast<subtype<T>>(y))); + return compcast<T>(shufflevector_i_t(inflate(csize_t<widthof<T>()>(), indices), compcast<subtype<T>>(x), + compcast<subtype<T>>(y))); } template <size_t... Indices, size_t Nout = sizeof...(Indices), typename T, size_t N> -CMT_INLINE vec<T, Nout> shufflevector(csizes_t<Indices...>, const vec<T, N>& x) +CMT_INLINE vec<T, Nout> shufflevector_i(csizes_t<Indices...>, const vec<T, N>& x) { - return internal::shufflevector<T, N>(csizes<Indices...>, x, x); + return internal::shufflevector_i_t<T, N>(csizes_t<Indices...>(), x, x); } template <typename Fn, size_t groupsize, typename T, size_t N, size_t... Indices, @@ -245,23 +245,25 @@ template <typename Fn, size_t groupsize, typename T, size_t N, size_t... Indices CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y, cvals_t<size_t, Indices...>) { static_assert(N % groupsize == 0, "N % groupsize == 0"); - return internal::shufflevector<T, N>( - csizes<(get_vec_index<Fn, Indices / groupsize>() * groupsize + Indices % groupsize)...>, x, y); + constexpr csizes_t<(get_vec_index<Fn, Indices / groupsize>() * groupsize + Indices % groupsize)...> + vindices{}; + return internal::shufflevector_i_t<T, N>(vindices, x, y); } } template <size_t Nout, typename Fn, size_t groupsize = 1, typename T, size_t N> CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x, const vec<T, N>& y) { - return internal::shufflevector<Fn, groupsize>(x, y, csizeseq<Nout>); + return internal::shufflevector<Fn, groupsize>(x, y, cvalseq_t<size_t, Nout>()); } template <size_t Nout, typename Fn, size_t groupsize = 1, typename T, size_t N> CMT_INLINE vec<T, Nout> shufflevector(const vec<T, N>& x) { - return internal::shufflevector<Fn, groupsize>(x, x, csizeseq<Nout>); + return internal::shufflevector<Fn, groupsize>(x, x, cvalseq_t<size_t, Nout>()); } +#ifdef KFR_ENABLE_SWIZZLE namespace swizzle { template <size_t> @@ -300,9 +302,10 @@ constexpr swiz<13> s13{}; constexpr swiz<14> s14{}; constexpr swiz<15> s15{}; } +#endif -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast") template <size_t N, typename T> constexpr CMT_INLINE vec<T, N> broadcast(T x) @@ -310,7 +313,7 @@ constexpr CMT_INLINE vec<T, N> broadcast(T x) return x; } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) namespace internal { @@ -381,14 +384,15 @@ constexpr CMT_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& value) noexcept return *value; } -template <typename To, typename From, typename Tout = deep_rebind<From, To>> +template <typename To, typename From, + typename Tout = typename compound_type_traits<From>::template deep_rebind<To>> constexpr CMT_INLINE Tout cast(const From& value) noexcept { return static_cast<Tout>(value); } template <typename To, typename From> -constexpr CMT_INLINE To bitcast(const From& value) noexcept +CMT_GNU_CONSTEXPR CMT_INLINE To bitcast(const From& value) noexcept { static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types"); union { @@ -399,7 +403,7 @@ constexpr CMT_INLINE To bitcast(const From& value) noexcept } template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> -constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept +CMT_GNU_CONSTEXPR CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept { using Tsub = typename vec<To, Nout>::scalar_type; constexpr size_t width = vec<To, Nout>::scalar_size(); @@ -407,7 +411,7 @@ constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept } template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> -constexpr CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept +CMT_GNU_CONSTEXPR CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept { using Tsub = typename mask<To, Nout>::scalar_type; constexpr size_t width = mask<To, Nout>::scalar_size(); @@ -456,7 +460,7 @@ constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); } template <typename T, size_t N, size_t... Sizes> -CMT_INLINE vec<T, N + csum(csizes<Sizes...>)> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest); +CMT_INLINE vec<T, csum<size_t, N, Sizes...>()> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest); namespace internal { @@ -567,7 +571,7 @@ CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, co constexpr size_t width = compound_type_traits<T>::width; const T list[] = { args... }; using simd_t = typename vec<T, N>::simd_t; - return simd_t{ compound_type_traits<T>::at(list[indices / width], indices % width)... }; + return KFR_SIMD_SET(simd_t, compound_type_traits<T>::at(list[indices / width], indices % width)...); } } @@ -599,7 +603,7 @@ template <typename Type = void, typename Arg, typename... Args, size_t N = (size KFR_ENABLE_IF(is_number<subtype<SubType>>::value)> constexpr CMT_INLINE vec<SubType, N> pack(const Arg& x, const Args&... rest) { - return internal::make_vector_impl<SubType>(csizeseq<N * widthof<SubType>()>, static_cast<SubType>(x), + return internal::make_vector_impl<SubType>(csizeseq_t<N * widthof<SubType>()>(), static_cast<SubType>(x), static_cast<SubType>(rest)...); } KFR_FN(pack) @@ -616,11 +620,19 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty { static_assert(N > 0 && N <= 256, "Invalid vector size"); + static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value || + std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value || + std::is_same<T, short>::value || std::is_same<T, unsigned short>::value || + std::is_same<T, int>::value || std::is_same<T, unsigned int>::value || + std::is_same<T, long>::value || std::is_same<T, unsigned long>::value || + std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value || + !compound_type_traits<T>::is_scalar, + "Invalid vector type"); + static_assert(!is_vec<T>::value || is_poweroftwo(size_of<T>()), "Inner vector size must be a power of two"); constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } - using UT = utype<T>; using value_type = T; using scalar_type = subtype<T>; using simd_t = simd<scalar_type, N * compound_type_traits<T>::width>; @@ -630,34 +642,43 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty constexpr static bool is_pod = true; constexpr CMT_INLINE vec() noexcept {} - constexpr CMT_INLINE vec(simd_t value) noexcept : v(value) {} + constexpr CMT_INLINE vec(const simd_t& value) noexcept : v(value) {} + constexpr CMT_INLINE vec(const vec&) noexcept = default; + constexpr CMT_INLINE vec(vec&&) noexcept = default; + CMT_INLINE vec& operator=(const vec&) noexcept = default; + CMT_INLINE vec& operator=(vec&&) noexcept = default; + template <typename U, - KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width > 1)> + typename = enable_if<(std::is_convertible<U, T>::value && compound_type_traits<T>::width > 1)>> constexpr CMT_INLINE vec(const U& value) noexcept : v(*resize<scalar_size()>(bitcast<scalar_type>(make_vector(static_cast<T>(value))))) { } template <typename U, - KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width == 1)> + typename = enable_if<(std::is_convertible<U, T>::value && compound_type_traits<T>::width == 1)>, + typename = void> constexpr CMT_INLINE vec(const U& value) noexcept : v(KFR_SIMD_BROADCAST(T, N, static_cast<T>(value))) { } + + template <typename U, + typename = enable_if<std::is_convertible<U, T>::value && !std::is_same<vec<U, N>, T>::value>> + CMT_GNU_CONSTEXPR CMT_INLINE vec(const vec<U, N>& value) noexcept + : v(*internal::conversion<vec<T, N>, vec<U, N>>::cast(value)) + { + } template <typename... Ts> - constexpr CMT_INLINE vec(const T& x, const T& y, const Ts&... rest) noexcept - : v(*make_vector<T>(x, y, rest...)) + CMT_GNU_CONSTEXPR CMT_INLINE vec(const T& x, const T& y, const Ts&... rest) noexcept + : v(*make_vector<T>(x, y, static_cast<T>(rest)...)) { static_assert(N <= 2 + sizeof...(Ts), "Too few initializers for vec"); } template <size_t N1, size_t N2, size_t... Ns> - constexpr CMT_INLINE vec(const vec<T, N1>& v1, const vec<T, N2>& v2, - const vec<T, Ns>&... vectors) noexcept : v(*concat(v1, v2, vectors...)) + CMT_GNU_CONSTEXPR CMT_INLINE vec(const vec<T, N1>& v1, const vec<T, N2>& v2, + const vec<T, Ns>&... vectors) noexcept : v(*concat(v1, v2, vectors...)) { - static_assert(csum(csizes<N1, N2, Ns...>) == N, "Can't concat vectors: invalid csizes"); + static_assert(csum(csizes_t<N1, N2, Ns...>()) == N, "Can't concat vectors: invalid csizes"); } - constexpr CMT_INLINE vec(const vec&) noexcept = default; - constexpr CMT_INLINE vec(vec&&) noexcept = default; - CMT_INLINE vec& operator=(const vec&) noexcept = default; - CMT_INLINE vec& operator=(vec&&) noexcept = default; friend CMT_INLINE vec operator-(const vec& x) { return vec_op<T, N>::neg(x.v); } friend CMT_INLINE vec operator~(const vec& x) { return vec_op<T, N>::bnot(x.v); } @@ -701,11 +722,11 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty using array_t = T (&)[N]; CMT_INLINE array_t arr() { return ref_cast<array_t>(v); } - template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value && !std::is_same<U, vec>::value)> - CMT_INLINE constexpr operator vec<U, N>() const noexcept + /*template <typename U, KFR_ENABLE_IF(std::is_convertible<T, U>::value && !std::is_same<U, vec>::value)> + CMT_INLINE operator vec<U, N>() const noexcept { return internal::conversion<vec<U, N>, vec<T, N>>::cast(*this); - } + }*/ private: struct getter_setter; @@ -815,11 +836,12 @@ struct mask : public vec<T, N> using type = T; constexpr static size_t width = N; - using base = vec<T, N>; + using simd_t = typename vec<T, N>::simd_t; + using base = vec<T, N>; constexpr CMT_INLINE mask() noexcept : base() {} - constexpr CMT_INLINE mask(simd<T, N> value) noexcept : base(value) {} + constexpr CMT_INLINE mask(const simd_t& value) noexcept : base(value) {} template <size_t N1, size_t... Ns> constexpr CMT_INLINE mask(const mask<T, N1>& mask1, const mask<T, Ns>&... masks) noexcept : base(*concat(mask1, masks...)) @@ -919,32 +941,34 @@ constexpr mask<T, Nout> partial_mask_helper(csizes_t<indices...>) template <typename T, size_t Nout, size_t N1> constexpr mask<T, Nout> partial_mask() { - return internal::partial_mask_helper<T, Nout, N1>(csizeseq<Nout>); + return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>()); } template <typename T, size_t N> -CMT_INLINE vec<T, N> concat(const vec<T, N>& x) +CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x) { return x; } template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<T, N1 + N2> concat(const vec<T, N1>& x, const vec<T, N2>& y) +CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y) { return concattwo<0, N1 + N2>(x, y); } template <typename T, size_t N1, size_t N2, size_t... Sizes> -CMT_INLINE auto concat(const vec<T, N1>& x, const vec<T, N2>& y, const vec<T, Sizes>&... args) +CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y, + const vec<T, Sizes>&... args) { - return concat(x, concat(y, args...)); + return concat_impl(concat_impl(x, y), args...); + // return concat(x, concat(y, args...)); } } template <typename T, size_t N, size_t... Sizes> -CMT_INLINE vec<T, N + csum(csizes<Sizes...>)> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest) +CMT_INLINE vec<T, csum<size_t, N, Sizes...>()> concat(const vec<T, N>& x, const vec<T, Sizes>&... rest) { - return internal::concat(x, rest...); + return internal::concat_impl(x, rest...); } KFR_FN(concat) @@ -1239,13 +1263,14 @@ template <typename T, size_t N, typename Fn, typename... Args, typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> constexpr CMT_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args) { - return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>, arg, std::forward<Args>(args)...); + return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>(), arg, + std::forward<Args>(args)...); } template <size_t N, typename Fn, typename T = result_of<Fn()>> constexpr CMT_INLINE vec<T, N> apply(Fn&& fn) { - return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>); + return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>()); } template <typename T, int N> @@ -1273,26 +1298,26 @@ constexpr CMT_INLINE mask<T, Nout> make_mask(bool arg, Args... args) KFR_FN(make_mask) template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> zerovector() +CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector() { constexpr size_t width = N * compound_type_traits<T>::width; return compcast<T>(vec<subtype<T>, width>(simd<subtype<T>, width>())); } template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> zerovector(vec_t<T, N>) +CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector(vec_t<T, N>) { return zerovector<T, N>(); } KFR_FN(zerovector) template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> allonesvector() +CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector() { return zerovector<T, N>() == zerovector<T, N>(); } template <typename T, size_t N> -constexpr CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>) +CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>) { return allonesvector<T, N>(); } @@ -1337,7 +1362,7 @@ KFR_FN(low) KFR_FN(high) } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) namespace cometa { @@ -1355,7 +1380,7 @@ struct compound_type_traits<kfr::vec_t<T, N>> template <typename U> using rebind = kfr::vec_t<U, N>; template <typename U> - using deep_rebind = kfr::vec_t<cometa::deep_rebind<subtype, U>, N>; + using deep_rebind = kfr::vec_t<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; }; template <typename T, size_t N> @@ -1370,7 +1395,7 @@ struct compound_type_traits<kfr::vec<T, N>> template <typename U> using rebind = kfr::vec<U, N>; template <typename U> - using deep_rebind = kfr::vec<cometa::deep_rebind<subtype, U>, N>; + using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; CMT_INLINE static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) { return value[index]; } }; @@ -1387,7 +1412,7 @@ struct compound_type_traits<kfr::mask<T, N>> template <typename U> using rebind = kfr::mask<U, N>; template <typename U> - using deep_rebind = kfr::mask<cometa::deep_rebind<subtype, U>, N>; + using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>; CMT_INLINE static constexpr subtype at(const kfr::mask<T, N>& value, size_t index) { @@ -1395,6 +1420,7 @@ struct compound_type_traits<kfr::mask<T, N>> } }; } + namespace std { template <typename T1, typename T2, size_t N> @@ -1412,6 +1438,16 @@ struct common_type<T1, kfr::vec<T2, N>> { using type = kfr::vec<typename common_type<T1, T2>::type, N>; }; +template <typename T1, typename T2, size_t N1, size_t N2> +struct common_type<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>> +{ + using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>; +}; +template <typename T1, typename T2, size_t N1, size_t N2> +struct common_type<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>> +{ + using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>; +}; template <typename T1, typename T2, size_t N> struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -205,6 +205,11 @@ extern char* gets(char* __s); #if defined(__GNUC__) || defined(__clang__) // GCC, Clang #define CMT_COMPILER_GNU 1 + +#if !defined(__clang__) // GCC only +#define CMT_COMPILER_GCC 1 +#endif + #define CMT_GNU_ATTRIBUTES 1 #define CMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) #if __cplusplus >= 201103L || defined __GXX_EXPERIMENTAL_CXX0X__ @@ -261,8 +266,9 @@ extern char* gets(char* __s); #endif -#if defined _MSC_VER && _MSC_VER >= 1900 && defined(__clang__) && \ - (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9)) +#if defined _MSC_VER && _MSC_VER >= 1900 && \ + (!defined(__clang__) || \ + (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9)))) #define CMT_EMPTY_BASES __declspec(empty_bases) #else #define CMT_EMPTY_BASES @@ -389,10 +395,12 @@ extern char* gets(char* __s); #define CMT_HAS_EXCEPTIONS 1 #endif +#if defined __has_include #if __has_include(<assert.h>) #include <assert.h> #define CMT_HAS_ASSERT_H 1 #endif +#endif #ifndef CMT_THROW #if CMT_HAS_EXCEPTIONS @@ -431,8 +439,46 @@ extern char* gets(char* __s); #define CMT_FAST_CC __attribute__((fastcall)) #define CMT_UNUSED __attribute__((unused)) #define CMT_GNU_CONSTEXPR constexpr +#define CMT_GNU_PACKED __attribute__((packed)) +#define CMT_PRAGMA_PACK_PUSH_1 +#define CMT_PRAGMA_PACK_POP +#define CMT_FP(h, d) h +#define CMT_PRAGMA_GNU(...) _Pragma(#__VA_ARGS__) +#ifdef CMT_COMPILER_CLANG +#define CMT_PRAGMA_CLANG(...) _Pragma(#__VA_ARGS__) +#else +#define CMT_PRAGMA_CLANG(...) +#endif +#ifdef CMT_COMPILER_CLANG +#define CMT_PRAGMA_GCC(...) _Pragma(#__VA_ARGS__) +#else +#define CMT_PRAGMA_GCC(...) +#endif +#define CMT_PRAGMA_MSVC(...) #else #define CMT_FAST_CC __fastcall #define CMT_UNUSED #define CMT_GNU_CONSTEXPR +#define CMT_GNU_PACKED +#define CMT_PRAGMA_PACK_PUSH_1 __pragma(pack(push, 1)) +#define CMT_PRAGMA_PACK_POP __pragma(pack(pop)) +#define CMT_FP(h, d) d +#define CMT_PRAGMA_GNU(...) +#define CMT_PRAGMA_CLANG(...) +#define CMT_PRAGMA_GCC(...) +#define CMT_PRAGMA_MSVC(...) __pragma(__VA_ARGS__) +#endif + +#if defined CMT_COMPILER_CLANG +#if defined _MSC_VER +#define CMT_COMPIER_NAME "clang-msvc" +#else +#define CMT_COMPIER_NAME "clang" +#endif +#elif defined CMT_COMPILER_GCC +#define CMT_COMPIER_NAME "gcc" +#elif defined CMT_COMPILER_MSVC +#define CMT_COMPIER_NAME "msvc" +#else +#define CMT_COMPIER_NAME "unknown" #endif diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -10,8 +10,11 @@ #include <type_traits> #include <utility> -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4814)) namespace cometa { @@ -228,11 +231,12 @@ using subtype = typename compound_type_traits<T>::subtype; template <typename T> using deep_subtype = typename compound_type_traits<T>::deep_subtype; -template <typename T, typename SubType> +/*template <typename T, typename SubType> using rebind_subtype = typename compound_type_traits<T>::template rebind<SubType>; template <typename T, typename SubType> using deep_rebind = typename compound_type_traits<T>::template deep_rebind<SubType>; + */ template <typename T> struct compound_type_traits<std::pair<T, T>> @@ -247,7 +251,8 @@ struct compound_type_traits<std::pair<T, T>> template <typename U> using rebind = std::pair<U, U>; template <typename U> - using deep_rebind = std::pair<cometa::deep_rebind<subtype, U>, cometa::deep_rebind<subtype, U>>; + using deep_rebind = std::pair<typename compound_type_traits<subtype>::template deep_rebind<U>, + typename compound_type_traits<subtype>::template deep_rebind<U>>; CMT_INLINE static constexpr const subtype& at(const std::pair<subtype, subtype>& value, size_t index) { @@ -346,27 +351,12 @@ using cuint_t = cval_t<unsigned, val>; template <size_t val> using csize_t = cval_t<size_t, val>; -template <typename T, T val> -constexpr cval_t<T, val> cval{}; - -template <bool val> -constexpr cbool_t<val> cbool{}; - using cfalse_t = cbool_t<false>; using ctrue_t = cbool_t<true>; constexpr ctrue_t ctrue{}; constexpr cfalse_t cfalse{}; -template <int val> -constexpr cint_t<val> cint{}; - -template <unsigned val> -constexpr cuint_t<val> cuint{}; - -template <size_t val> -constexpr csize_t<val> csize{}; - namespace details { template <size_t index, typename T, T first, T... rest> @@ -406,17 +396,17 @@ struct cvals_t : ops::empty using type = cvals_t<T, values...>; constexpr static size_t size() { return sizeof...(values); } template <size_t index> - constexpr T operator[](csize_t<index>) + constexpr T operator[](csize_t<index>) const { - return get(csize<index>); + return get(csize_t<index>()); } template <size_t index> constexpr static T get(csize_t<index> = csize_t<index>()) { return details::get_nth<index, T, values...>::value; } - constexpr static T front() { return get(csize<0>); } - constexpr static T back() { return get(csize<size() - 1>); } + constexpr static T front() { return get(csize_t<0>()); } + constexpr static T back() { return get(csize_t<size() - 1>()); } static const T* begin() { return array(); } static const T* end() { return array() + size(); } @@ -444,6 +434,8 @@ struct cvals_t<T> : ops::empty template <bool... values> using cbools_t = cvals_t<bool, values...>; +constexpr cbools_t<false, true> cfalse_true{}; + template <int... values> using cints_t = cvals_t<int, values...>; @@ -459,39 +451,16 @@ using csizes_t = cvals_t<size_t, values...>; template <size_t... values> using elements_t = cvals_t<size_t, values...>; -template <typename T, T... values> -constexpr cvals_t<T, values...> cvals{}; - -template <bool... vals> -constexpr cbools_t<vals...> cbools{}; - -constexpr cbools_t<false, true> cfalse_true{}; - -template <int... vals> -constexpr cints_t<vals...> cints{}; - -template <char... vals> -constexpr cchars_t<vals...> cchars{}; - -template <unsigned... vals> -constexpr cuints_t<vals...> cuints{}; - -template <size_t... vals> -constexpr csizes_t<vals...> csizes{}; - -template <size_t... vals> -constexpr elements_t<vals...> elements{}; - template <typename T> -constexpr inline T csum(cvals_t<T>) +constexpr inline T csum(cvals_t<T> = cvals_t<T>()) { return 0; } template <typename T, T first, T... rest> -constexpr inline T csum(cvals_t<T, first, rest...>) +constexpr inline T csum(cvals_t<T, first, rest...> = cvals_t<T, first, rest...>()) { - return first + csum(cvals<T, rest...>); + return first + csum(cvals_t<T, rest...>()); } template <typename T> @@ -503,7 +472,7 @@ constexpr inline T cprod(cvals_t<T>) template <typename T, T first, T... rest> constexpr inline T cprod(cvals_t<T, first, rest...>) { - return first * cprod(cvals<T, rest...>); + return first * cprod(cvals_t<T, rest...>()); } template <typename T> @@ -515,9 +484,6 @@ struct ctype_t template <typename T> using type_of = typename T::type; -template <typename T> -constexpr ctype_t<T> ctype{}; - template <typename... Types> struct ctypes_t { @@ -533,8 +499,6 @@ struct ctypes_t } }; -template <typename... Ts> -constexpr ctypes_t<Ts...> ctypes{}; namespace details { template <typename T1, typename T2> @@ -711,36 +675,12 @@ struct cvalseq_impl<T, 1, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart)> template <typename T, size_t size, T start = T(), ptrdiff_t step = 1> using cvalseq_t = typename details::cvalseq_impl<T, size, start, step>::type; -template <typename T, T begin, T end> -constexpr cvalseq_t<T, end - begin, begin> cvalrange{}; - -template <size_t begin, size_t end> -constexpr cvalseq_t<size_t, end - begin, begin> csizerange{}; - -template <int begin, int end> -constexpr cvalseq_t<int, end - begin, begin> cintrange{}; - -template <unsigned begin, unsigned end> -constexpr cvalseq_t<unsigned, end - begin, begin> cuintrange{}; - -template <typename T, size_t size, T start = T(), ptrdiff_t step = 1> -constexpr cvalseq_t<T, size, start, step> cvalseq{}; - template <size_t size, size_t start = 0, ptrdiff_t step = 1> -constexpr cvalseq_t<size_t, size, start, step> csizeseq{}; - -template <size_t size, int start = 0, ptrdiff_t step = 1> -constexpr cvalseq_t<int, size, start, step> cintseq{}; - -template <size_t size, unsigned start = 0, ptrdiff_t step = 1> -constexpr cvalseq_t<unsigned, size, start, step> cuintseq{}; +using csizeseq_t = cvalseq_t<size_t, size, start, step>; template <typename... List> using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>; -template <typename... List> -constexpr indicesfor_t<List...> indicesfor{}; - namespace details { @@ -775,21 +715,27 @@ struct is_enabled_impl<Fn, void_t<decltype(Fn::disabled)>> : std::integral_const { }; -template <size_t N> +template <int N> struct unique_enum_impl { - enum class type : size_t + enum type : int { value = N }; }; -template <size_t N> -using unique_enum = typename unique_enum_impl<N>::type; +#ifdef CMT_COMPILER_MSVC +#define CMT_ENABLE_IF_IMPL(N, ...) \ + bool enable_ = (__VA_ARGS__), typename enabled_ = ::std::enable_if<enable_>::type, \ + typename cometa::details::unique_enum_impl<N>::type dummy_ = \ + ::cometa::details::unique_enum_impl<N>::value + +#else #define CMT_ENABLE_IF_IMPL(N, ...) \ - typename ::std::enable_if<(__VA_ARGS__), ::cometa::details::unique_enum<N>>::type = \ - ::cometa::details::unique_enum<N>::value + typename ::std::enable_if<(__VA_ARGS__), typename ::cometa::details::unique_enum_impl<N>::type>::type = \ + ::cometa::details::unique_enum_impl<N>::value +#endif #define CMT_ENABLE_IF(...) CMT_ENABLE_IF_IMPL(__LINE__, __VA_ARGS__) } @@ -1020,10 +966,16 @@ struct identity_impl }; template <typename T> -constexpr size_t elementsize = sizeof(T); +constexpr size_t elementsize() +{ + return sizeof(T); +} template <> -constexpr size_t elementsize<void> = 1; +constexpr size_t elementsize<void>() +{ + return 1; +}; } template <typename T> @@ -1048,7 +1000,7 @@ struct carray<T, 1> template <typename Fn, size_t index = 0, CMT_ENABLE_IF(is_callable<Fn, csize_t<index>>::value)> CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept - : val(static_cast<T>(fn(csize<index>))) + : val(static_cast<T>(fn(csize_t<index>()))) { } @@ -1071,12 +1023,12 @@ struct carray<T, 1> template <size_t index> CMT_INTRIN constexpr T& get() noexcept { - return get(csize<index>); + return get(csize_t<index>()); } template <size_t index> CMT_INTRIN constexpr const T& get() const noexcept { - return get(csize<index>); + return get(csize_t<index>()); } CMT_INTRIN constexpr const T* front() const noexcept { return val; } CMT_INTRIN constexpr T* front() noexcept { return val; } @@ -1103,8 +1055,8 @@ struct carray : carray<T, N - 1> template <typename Fn, size_t index = N - 1> CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept - : carray<T, N - 1>(std::forward<Fn>(fn), csize<index - 1>), - val(static_cast<T>(fn(csize<index>))) + : carray<T, N - 1>(std::forward<Fn>(fn), csize_t<index - 1>()), + val(static_cast<T>(fn(csize_t<index>()))) { } @@ -1116,23 +1068,23 @@ struct carray : carray<T, N - 1> template <size_t index> CMT_INTRIN constexpr T& get(csize_t<index>) noexcept { - return carray<T, N - 1>::get(csize<index>); + return carray<T, N - 1>::get(csize_t<index>()); } CMT_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; } template <size_t index> CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept { - return carray<T, N - 1>::get(csize<index>); + return carray<T, N - 1>::get(csize_t<index>()); } template <size_t index> CMT_INTRIN constexpr T& get() noexcept { - return get(csize<index>); + return get(csize_t<index>()); } template <size_t index> CMT_INTRIN constexpr const T& get() const noexcept { - return get(csize<index>); + return get(csize_t<index>()); } CMT_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); } CMT_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); } @@ -1307,10 +1259,25 @@ using has_data_size = details::has_data_size_impl<decay<T>>; template <typename T> using value_type_of = typename decay<T>::value_type; +#ifndef CMT_COMPILER_CLANG +namespace details +{ +template <typename T, T value, typename Fn> +void cforeach_impl(Fn&& fn) +{ + fn(cval_t<T, value>()); +} +} +#endif + template <typename T, T... values, typename Fn> CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn) { - swallow{ (fn(cval<T, values>), void(), 0)... }; +#ifdef CMT_COMPILER_CLANG + swallow{ (fn(cval_t<T, values>()), void(), 0)... }; +#else + swallow{ (details::cforeach_impl<T, values>(std::forward<Fn>(fn)), void(), 0)... }; +#endif } template <typename T, typename Fn, CMT_ENABLE_IF(has_begin_end<T>::value)> @@ -1335,9 +1302,9 @@ namespace details { template <size_t index, typename... types> -CMT_INTRIN auto get_type_arg(ctypes_t<types...> type_list) +CMT_INTRIN auto get_type_arg(ctypes_t<types...>) { - return ctype<type_of<details::get_nth_type<index, types...>>>; + return ctype_t<type_of<details::get_nth_type<index, types...>>>(); } template <typename T0, typename... types, typename Fn, size_t... indices> @@ -1350,7 +1317,7 @@ CMT_INTRIN void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, c template <typename... Ts, typename Fn> CMT_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn) { - details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq<sizeof...(Ts)>); + details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq_t<sizeof...(Ts)>()); } template <typename A0, typename A1, typename Fn> @@ -1371,26 +1338,26 @@ CMT_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn) template <typename TrueFn, typename FalseFn = fn_noop> CMT_INTRIN decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn()) { - return truefn(cbool<true>); + return truefn(ctrue); } template <typename TrueFn, typename FalseFn = fn_noop> CMT_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn()) { - return falsefn(cbool<false>); + return falsefn(cfalse); } template <typename T, T start, T stop, typename BodyFn> CMT_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn) { - return cforeach(cvalrange<T, start, stop>, std::forward<BodyFn>(bodyfn)); + return cforeach(cvalseq_t<T, stop - start, start>(), std::forward<BodyFn>(bodyfn)); } template <typename T, T... vs, typename U, typename Function, typename Fallback = fn_noop> void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& fallback = Fallback()) { bool result = false; - swallow{ (result = result || ((vs == value) ? (function(cval<T, vs>), void(), true) : false), void(), + swallow{ (result = result || ((vs == value) ? (function(cval_t<T, vs>()), void(), true) : false), void(), 0)... }; if (!result) fallback(); @@ -1408,7 +1375,7 @@ CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, { if (cmpfn(value, v0)) { - return fn(cval<T, v0>); + return fn(cval_t<T, v0>()); } else { @@ -1442,7 +1409,7 @@ inline decltype(auto) cmatch_impl(T&& value, Fn1&& first, Fn2&& second, Fns&&... { using first_arg = typename function_arguments<Fn1>::template nth<0>; constexpr bool is_same = std::is_same<decay<T>, decay<first_arg>>::value; - return cmatch_impl2(cbool<is_same>, std::forward<T>(value), std::forward<Fn1>(first), + return cmatch_impl2(cbool_t<is_same>(), std::forward<T>(value), std::forward<Fn1>(first), std::forward<Fn2>(second), std::forward<Fns>(rest)...); } @@ -1588,9 +1555,9 @@ constexpr inline ptrdiff_t distance(const void* x, const void* y) return static_cast<const unsigned char*>(x) - static_cast<const unsigned char*>(y); } -#pragma GCC diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wundefined-reinterpret-cast") -#pragma GCC diagnostic ignored "-Wundefined-reinterpret-cast" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wundefined-reinterpret-cast") #endif template <typename T, typename U> @@ -1641,7 +1608,80 @@ CMT_INLINE constexpr static T implicit_cast(U&& value) return std::forward<T>(value); } -#pragma GCC diagnostic pop +#ifdef CMT_COMPILER_GNU + +template <typename T, T val> +constexpr cval_t<T, val> cval{}; + +template <bool val> +constexpr cbool_t<val> cbool{}; + +template <int val> +constexpr cint_t<val> cint{}; + +template <unsigned val> +constexpr cuint_t<val> cuint{}; + +template <size_t val> +constexpr csize_t<val> csize{}; + +template <typename T, T... values> +constexpr cvals_t<T, values...> cvals{}; + +template <bool... vals> +constexpr cbools_t<vals...> cbools{}; + +template <int... vals> +constexpr cints_t<vals...> cints{}; + +template <char... vals> +constexpr cchars_t<vals...> cchars{}; + +template <unsigned... vals> +constexpr cuints_t<vals...> cuints{}; + +template <size_t... vals> +constexpr csizes_t<vals...> csizes{}; + +template <size_t... vals> +constexpr elements_t<vals...> elements{}; + +template <typename T> +constexpr ctype_t<T> ctype{}; + +template <typename... Ts> +constexpr ctypes_t<Ts...> ctypes{}; + +template <typename T, T begin, T end> +constexpr cvalseq_t<T, end - begin, begin> cvalrange{}; + +template <size_t begin, size_t end> +constexpr cvalseq_t<size_t, end - begin, begin> csizerange{}; + +template <int begin, int end> +constexpr cvalseq_t<int, end - begin, begin> cintrange{}; + +template <unsigned begin, unsigned end> +constexpr cvalseq_t<unsigned, end - begin, begin> cuintrange{}; + +template <typename T, size_t size, T start = T(), ptrdiff_t step = 1> +constexpr cvalseq_t<T, size, start, step> cvalseq{}; + +template <size_t size, size_t start = 0, ptrdiff_t step = 1> +constexpr cvalseq_t<size_t, size, start, step> csizeseq{}; + +template <size_t size, int start = 0, ptrdiff_t step = 1> +constexpr cvalseq_t<int, size, start, step> cintseq{}; + +template <size_t size, unsigned start = 0, ptrdiff_t step = 1> +constexpr cvalseq_t<unsigned, size, start, step> cuintseq{}; +template <typename... List> +constexpr indicesfor_t<List...> indicesfor{}; +#endif + +CMT_PRAGMA_GNU(GCC diagnostic pop) } -#pragma GCC diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/cometa/array.hpp b/include/kfr/cometa/array.hpp @@ -30,8 +30,13 @@ public: constexpr array_ref() noexcept : m_data(nullptr), m_size(0) {} constexpr array_ref(const array_ref&) noexcept = default; constexpr array_ref(array_ref&&) noexcept = default; +#ifdef CMT_COMPILER_GNU constexpr array_ref& operator=(const array_ref&) noexcept = default; constexpr array_ref& operator=(array_ref&&) noexcept = default; +#else + array_ref& operator=(const array_ref&) = default; + array_ref& operator=(array_ref&&) = default; +#endif template <size_t N> constexpr array_ref(value_type (&arr)[N]) noexcept : m_data(arr), m_size(N) diff --git a/include/kfr/cometa/cstring.hpp b/include/kfr/cometa/cstring.hpp @@ -31,13 +31,13 @@ struct cstring template <size_t start, size_t count> constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const noexcept { - return slice_impl(csizeseq<count, start>); + return slice_impl(csizeseq_t<count, start>()); } template <size_t start> constexpr cstring<N - start> slice(csize_t<start>) const noexcept { - return slice_impl(csizeseq<N - 1 - start, start>); + return slice_impl(csizeseq_t<N - 1 - start, start>()); } constexpr friend bool operator==(const cstring& left, const cstring& right) noexcept @@ -85,14 +85,13 @@ CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring< const cstring<N2>& str2, csizes_t<indices...>) { - constexpr size_t L1 = N1 - 1; - return { { (indices < L1 ? str1[indices] : str2[indices - L1])..., 0 } }; + return { { (indices < (N1 - 1) ? str1[indices] : str2[indices - (N1 - 1)])..., 0 } }; } template <size_t N1, size_t N2, typename... Args> CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<N1>& str1, const cstring<N2>& str2) { - return concat_str_impl(str1, str2, csizeseq<N1 - 1 + N2 - 1>); + return concat_str_impl(str1, str2, cvalseq_t<size_t, N1 - 1 + N2 - 1>()); } template <size_t N1, size_t Nfrom, size_t Nto, size_t... indices> CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str, @@ -125,7 +124,7 @@ CMT_INTRIN constexpr auto concat_cstring(const cstring<N1>& str1, const cstring< template <size_t N> CMT_INTRIN constexpr cstring<N> make_cstring(const char (&str)[N]) { - return details::make_cstring_impl(str, csizeseq<N - 1>); + return details::make_cstring_impl(str, cvalseq_t<size_t, N - 1>()); } template <char... chars> @@ -154,6 +153,7 @@ template <size_t N1, size_t Nfrom, size_t Nto> CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from, const cstring<Nto>& to) { - return details::str_replace_impl(str_find(str, from), str, from, to, csizeseq<N1 - Nfrom + Nto - 1>); + return details::str_replace_impl(str_find(str, from), str, from, to, + cvalseq_t<size_t, N1 - Nfrom + Nto - 1>()); } } diff --git a/include/kfr/cometa/ctti.hpp b/include/kfr/cometa/ctti.hpp @@ -45,9 +45,9 @@ constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) template <typename T> constexpr auto ctype_name() noexcept { - constexpr size_t length = - sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix - details::typename_postfix; - return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix, csizeseq<length>); + return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix, + csizeseq_t<(sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix - + details::typename_postfix)>()); } /** diff --git a/include/kfr/cometa/function.hpp b/include/kfr/cometa/function.hpp @@ -143,7 +143,7 @@ inline function<Ret(Args...)> cdispatch(cvals_t<T, v0, values...>, identity<T> v if (value == v0) { return [=](Args... args) - CMT_INLINE_MEMBER -> Ret { return fn(cval<T, v0>, std::forward<Args>(args)...); }; + CMT_INLINE_MEMBER -> Ret { return fn(cval_t<T, v0>(), std::forward<Args>(args)...); }; } else { diff --git a/include/kfr/cometa/named_arg.hpp b/include/kfr/cometa/named_arg.hpp @@ -5,6 +5,9 @@ #include "../cometa.hpp" +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4814)) + namespace cometa { template <typename T> @@ -28,3 +31,5 @@ struct named inline named operator""_arg(const char* name, size_t) { return name; } } + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/cometa/string.hpp b/include/kfr/cometa/string.hpp @@ -12,10 +12,10 @@ #include <string> #include <utility> -#pragma GCC diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wformat-security") -#pragma GCC diagnostic ignored "-Wformat-security" -#pragma GCC diagnostic ignored "-Wused-but-marked-unused" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wformat-security") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wused-but-marked-unused") #endif namespace cometa @@ -24,11 +24,12 @@ namespace cometa template <typename T> struct representation { + using type = T; static constexpr const T& get(const T& value) noexcept { return value; } }; template <typename T> -using repr_type = decay<decltype(representation<T>::get(std::declval<T>()))>; +using repr_type = typename representation<T>::type; template <typename... Args> CMT_INLINE std::string as_string(const Args&... args); @@ -102,7 +103,7 @@ CMT_INLINE constexpr auto value_fmt(ctype_t<const void*>) { return make_cstring( template <char... chars> CMT_INLINE constexpr auto value_fmt(ctype_t<cchars_t<chars...>>) { - return concat_cstring(make_cstring("%s"), make_cstring(cchars<chars...>)); + return concat_cstring(make_cstring("%s"), make_cstring(cchars_t<chars...>())); } template <typename T> @@ -115,7 +116,7 @@ template <typename T, int width, int prec> CMT_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, static_cast<char>(-1), width, prec>> fmt) { return concat_cstring(make_cstring("%"), value_fmt_arg(fmt), - value_fmt(ctype<repr_type<T>>).slice(csize<1>)); + value_fmt(ctype_t<repr_type<T>>()).slice(csize_t<1>())); } template <typename T, char t, int width, int prec> CMT_INLINE constexpr auto value_fmt(ctype_t<fmt_t<T, t, width, prec>> fmt) @@ -188,7 +189,7 @@ CMT_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>& template <size_t N1, size_t Nto> CMT_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt) { - return fmt_replace_impl(str, newfmt, csizeseq<N1 - 3 + Nto - 1>); + return fmt_replace_impl(str, newfmt, csizeseq_t<N1 - 3 + Nto - 1>()); } inline std::string replace_one(const std::string& str, const std::string& from, const std::string& to) @@ -207,8 +208,8 @@ CMT_INLINE const std::string& build_fmt(const std::string& str, ctypes_t<>) { re template <typename Arg, typename... Args> CMT_INLINE auto build_fmt(const std::string& str, ctypes_t<Arg, Args...>) { - constexpr auto fmt = value_fmt(ctype<decay<Arg>>); - return build_fmt(replace_one(str, "{}", std::string(fmt.data())), ctypes<Args...>); + constexpr auto fmt = value_fmt(ctype_t<decay<Arg>>()); + return build_fmt(replace_one(str, "{}", std::string(fmt.data())), ctypes_t<Args...>()); } } @@ -224,22 +225,24 @@ CMT_INLINE details::fmt_t<T, static_cast<char>(-1), width, prec> fmtwidth(const return { value }; } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wgnu-string-literal-operator-template" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wgnu-string-literal-operator-template") constexpr auto build_fmt_str(cchars_t<>, ctypes_t<>) { return make_cstring(""); } template <char... chars, typename Arg, typename... Args> constexpr auto build_fmt_str(cchars_t<'@', chars...>, ctypes_t<Arg, Args...>) { - return concat_cstring(details::value_fmt(ctype<decay<Arg>>), - build_fmt_str(cchars<chars...>, ctypes<Args...>)); + return concat_cstring(details::value_fmt(ctype_t<decay<Arg>>()), + build_fmt_str(cchars_t<chars...>(), ctypes_t<Args...>())); } template <char ch, char... chars, typename... Args> constexpr auto build_fmt_str(cchars_t<ch, chars...>, ctypes_t<Args...>) { - return concat_cstring(make_cstring(cchars<ch>), build_fmt_str(cchars<chars...>, ctypes<Args...>)); + return concat_cstring(make_cstring(cchars_t<ch>()), + build_fmt_str(cchars_t<chars...>(), ctypes_t<Args...>())); } template <char... chars> @@ -248,7 +251,7 @@ struct format_t template <typename... Args> inline std::string operator()(const Args&... args) { - constexpr auto format_str = build_fmt_str(cchars<chars...>, ctypes<repr_type<Args>...>); + constexpr auto format_str = build_fmt_str(cchars_t<chars...>(), ctypes_t<repr_type<Args>...>()); std::string result; const int size = std::snprintf(nullptr, 0, format_str.data(), details::pack_value(args)...); @@ -267,12 +270,14 @@ struct print_t template <typename... Args> CMT_INLINE void operator()(const Args&... args) { - constexpr auto format_str = build_fmt_str(cchars<chars...>, ctypes<repr_type<Args>...>); + constexpr auto format_str = build_fmt_str(cchars_t<chars...>(), ctypes_t<repr_type<Args>...>()); std::printf(format_str.data(), details::pack_value(args)...); } }; +#ifdef CMT_COMPILER_GNU + template <typename Char, Char... chars> constexpr format_t<chars...> operator""_format() { @@ -285,26 +290,28 @@ constexpr CMT_INLINE print_t<chars...> operator""_print() return {}; } -#pragma GCC diagnostic pop +#endif + +CMT_PRAGMA_GNU(GCC diagnostic pop) template <typename... Args> CMT_INLINE void printfmt(const std::string& fmt, const Args&... args) { - const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); + const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>()); std::printf(format_str.data(), details::pack_value(representation<Args>::get(args))...); } template <typename... Args> CMT_INLINE void fprintfmt(FILE* f, const std::string& fmt, const Args&... args) { - const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); + const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>()); std::fprintf(f, format_str.data(), details::pack_value(representation<Args>::get(args))...); } template <typename... Args> CMT_INLINE int snprintfmt(char* str, size_t size, const std::string& fmt, const Args&... args) { - const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); + const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>()); return std::snprintf(str, size, format_str.data(), details::pack_value(representation<Args>::get(args))...); } @@ -313,7 +320,7 @@ template <typename... Args> CMT_INLINE std::string format(const std::string& fmt, const Args&... args) { std::string result; - const auto format_str = details::build_fmt(fmt, ctypes<repr_type<Args>...>); + const auto format_str = details::build_fmt(fmt, ctypes_t<repr_type<Args>...>()); const int size = std::snprintf(nullptr, 0, format_str.data(), details::pack_value(representation<Args>::get(args))...); if (size <= 0) @@ -329,22 +336,24 @@ namespace details template <typename T> constexpr auto get_value_fmt() { - return details::value_fmt(ctype<decay<repr_type<T>>>); + return details::value_fmt(ctype_t<decay<repr_type<T>>>()); } } template <typename... Args> CMT_INLINE void print(const Args&... args) { - constexpr auto format_str = concat_cstring(details::get_value_fmt<Args>()...); - std::printf(format_str.data(), details::pack_value(representation<Args>::get(args))...); + constexpr const auto format_str = concat_cstring(details::get_value_fmt<Args>()...); + const char* str = format_str.data(); + std::printf(str, details::pack_value(representation<Args>::get(args))...); } template <typename... Args> CMT_INLINE void println(const Args&... args) { - constexpr auto format_str = concat_cstring(details::get_value_fmt<Args>()..., make_cstring("\n")); - std::printf(format_str.data(), details::pack_value(representation<Args>::get(args))...); + constexpr const auto format_str = concat_cstring(details::get_value_fmt<Args>()..., make_cstring("\n")); + const char* str = format_str.data(); + std::printf(str, details::pack_value(representation<Args>::get(args))...); } template <typename... Args> @@ -352,13 +361,13 @@ CMT_INLINE std::string as_string(const Args&... args) { std::string result; constexpr auto format_str = concat_cstring(details::get_value_fmt<Args>()...); + const char* str = format_str.data(); - const int size = - std::snprintf(nullptr, 0, format_str.data(), details::pack_value(representation<Args>::get(args))...); + const int size = std::snprintf(nullptr, 0, str, details::pack_value(representation<Args>::get(args))...); if (size <= 0) return result; result.resize(size_t(size + 1)); - result.resize(size_t(std::snprintf(&result[0], size_t(size + 1), format_str.data(), + result.resize(size_t(std::snprintf(&result[0], size_t(size + 1), str, details::pack_value(representation<Args>::get(args))...))); return result; } @@ -402,6 +411,7 @@ inline std::string join(T x, U y, Ts... rest) template <typename T> struct representation<named_arg<T>> { + using type = std::string; static std::string get(const named_arg<T>& value) { return std::string(value.name) + " = " + as_string(value.value); @@ -411,6 +421,7 @@ struct representation<named_arg<T>> template <typename T1, typename T2> struct representation<std::pair<T1, T2>> { + using type = std::string; static std::string get(const std::pair<T1, T2>& value) { return "(" + as_string(value.first) + "; " + as_string(value.second) + ")"; @@ -418,4 +429,4 @@ struct representation<std::pair<T1, T2>> }; } -#pragma GCC diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/data/sincos.hpp b/include/kfr/data/sincos.hpp @@ -32,82 +32,160 @@ namespace data { // data generated by mpfr -template <typename T> -constexpr T c_sin_table[64] = { - /* sin(2*pi* 0/ 256) */ T(0.0), - /* sin(2*pi* 1/ 256) */ T(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ T(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ T(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ T(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ T(0.1224106751992161984987044741509457875752), - /* sin(2*pi* 6/ 256) */ T(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ T(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ T(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ T(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ T(0.242980179903263889948274162077471118321), - /* sin(2*pi* 11/ 256) */ T(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ T(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ T(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ T(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ T(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ T(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ T(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ T(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ T(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ T(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ T(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ T(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ T(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 256) */ T(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ T(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ T(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ T(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ T(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ T(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ T(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ T(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ T(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ T(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ T(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ T(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ T(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ T(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ T(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ T(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ T(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ T(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ T(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ T(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ T(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ T(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ T(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ T(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ T(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ T(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ T(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ T(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ T(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ T(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ T(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ T(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ T(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ T(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ T(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ T(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ T(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ T(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ T(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ T(0.9996988186962042201157656496661721968501) +constexpr f32 c_sin_table_f32[64] = { + /* sin(2*pi* 0/ 256) */ f32(0.0), + /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), + /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ f32(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ f32(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501) +}; + +// data generated by mpfr +constexpr f64 c_sin_table_f64[64] = { + /* sin(2*pi* 0/ 256) */ f64(0.0), + /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), + /* sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ f64(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ f64(0.9996988186962042201157656496661721968501) }; } template <typename T> -constexpr inline T sin_using_table_256(size_t k) +constexpr inline T sin_using_table_256(size_t k); + +template <> +constexpr inline f32 sin_using_table_256<f32>(size_t k) +{ + return (k == 0 || k == 128) ? 0.f : (k == 64) ? 1.f : (k > 128) + ? -sin_using_table_256<f32>(k - 128) + : (k > 64) ? sin_using_table_256<f32>(128 - k) + : data::c_sin_table_f32[k]; +} +template <> +constexpr inline f64 sin_using_table_256<f64>(size_t k) { - return (k == 0 || k == 128) ? T(0) : (k == 64) ? T(1) : (k > 128) - ? -sin_using_table_256<T>(k - 128) - : (k > 64) ? sin_using_table_256<T>(128 - k) - : data::c_sin_table<T>[k]; + return (k == 0 || k == 128) ? 0.0 : (k == 64) ? 1.0 : (k > 128) + ? -sin_using_table_256<f64>(k - 128) + : (k > 64) ? sin_using_table_256<f64>(128 - k) + : data::c_sin_table_f64[k]; } template <typename T> diff --git a/include/kfr/dft/bitrev.hpp b/include/kfr/dft/bitrev.hpp @@ -43,7 +43,7 @@ namespace internal constexpr bool fft_reorder_aligned = false; template <size_t Bits> -constexpr inline u32 bitrev_using_table(u32 x) +CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x) { constexpr size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table)); if (Bits > bitrev_table_log2N) @@ -52,7 +52,7 @@ constexpr inline u32 bitrev_using_table(u32 x) return data::bitrev_table[x] >> (bitrev_table_log2N - Bits); } -constexpr inline u32 bitrev_using_table(u32 x, size_t bits) +CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits) { constexpr size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table)); if (bits > bitrev_table_log2N) @@ -61,7 +61,7 @@ constexpr inline u32 bitrev_using_table(u32 x, size_t bits) return data::bitrev_table[x] >> (bitrev_table_log2N - bits); } -constexpr inline u32 dig4rev_using_table(u32 x, size_t bits) +CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits) { constexpr size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table)); if (bits > bitrev_table_log2N) @@ -253,10 +253,10 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>) } template <typename T, bool use_br2> -void cwrite_reordered(T* out, cvec<T, 16> value, size_t N4, cbool_t<use_br2>) +void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) { - value = digitreverse < use_br2 ? 2 : 4, 2 > (value); - cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, value); + cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, + digitreverse<(use_br2 ? 2 : 4), 2>(value)); } template <typename T, bool use_br2> @@ -265,8 +265,8 @@ KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbo CMT_ASSUME(i != j); const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4); const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4); - cwrite_reordered(inout + j, vi, N4, cbool<use_br2>); - cwrite_reordered(inout + i, vj, N4, cbool<use_br2>); + cwrite_reordered(inout + j, vi, N4, cbool_t<use_br2>()); + cwrite_reordered(inout + i, vj, N4, cbool_t<use_br2>()); } template <typename T> @@ -316,7 +316,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) T* io = ptr_cast<T>(inout); size_t i = 0; -#pragma clang loop unroll_count(2) + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) for (; i < iend;) { size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; @@ -326,7 +326,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) i += istep * 4; } iend += N16; -#pragma clang loop unroll_count(2) + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) for (; i < iend;) { size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; @@ -341,7 +341,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) i += istep * 3; } iend += N16; -#pragma clang loop unroll_count(2) + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) for (; i < iend;) { size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; @@ -361,7 +361,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) i += istep * 2; } iend += N16; -#pragma clang loop unroll_count(2) + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) for (; i < iend;) { size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; diff --git a/include/kfr/dft/cache.hpp b/include/kfr/dft/cache.hpp @@ -39,11 +39,12 @@ using dft_plan_ptr = std::shared_ptr<const dft_plan<T>>; template <typename T> using dft_plan_real_ptr = std::shared_ptr<const dft_plan_real<T>>; -struct dft_cache +template <int = 0> +struct dft_cache_impl { - static dft_cache& instance() + static dft_cache_impl& instance() { - static dft_cache cache; + static dft_cache_impl cache; return cache; } dft_plan_ptr<f32> get(ctype_t<f32>, size_t size) @@ -120,10 +121,12 @@ private: #endif }; +using dft_cache = dft_cache_impl<>; + template <typename T, size_t Tag> univector<complex<T>> dft(const univector<complex<T>, Tag>& input) { - dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, input.size()); + dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), input.size()); univector<complex<T>> output(input.size()); univector<u8> temp(dft->temp_size); dft->execute(output, input, temp); @@ -133,7 +136,7 @@ univector<complex<T>> dft(const univector<complex<T>, Tag>& input) template <typename T, size_t Tag> univector<complex<T>> idft(const univector<complex<T>, Tag>& input) { - dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, input.size()); + dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), input.size()); univector<complex<T>> output(input.size()); univector<u8> temp(dft->temp_size); dft->execute(output, input, temp, ctrue); @@ -143,7 +146,7 @@ univector<complex<T>> idft(const univector<complex<T>, Tag>& input) template <typename T, size_t Tag> univector<complex<T>> realdft(const univector<T, Tag>& input) { - dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype<T>, input.size()); + dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype_t<T>(), input.size()); univector<complex<T>> output(input.size() / 2 + 1); univector<u8> temp(dft->temp_size); dft->execute(output, input, temp); @@ -153,7 +156,7 @@ univector<complex<T>> realdft(const univector<T, Tag>& input) template <typename T, size_t Tag> univector<T> irealdft(const univector<complex<T>, Tag>& input) { - dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype<T>, input.size()); + dft_plan_real_ptr<T> dft = dft_cache::instance().getreal(ctype_t<T>(), input.size()); univector<T> output((input.size() - 1) * 2); univector<u8> temp(dft->temp_size); dft->execute(output, input, temp); diff --git a/include/kfr/dft/conv.hpp b/include/kfr/dft/conv.hpp @@ -34,9 +34,9 @@ #include "cache.hpp" #include "fft.hpp" -#pragma clang diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") -#pragma clang diagnostic ignored "-Wshadow" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") #endif namespace kfr @@ -51,7 +51,7 @@ KFR_INTRIN univector<T> convolve(const univector<T, Tag1>& src1, const univector src1padded.resize(size, 0); src2padded.resize(size, 0); - dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, size); + dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size); univector<u8> temp(dft->temp_size); dft->execute(src1padded, src1padded, temp); dft->execute(src2padded, src2padded, temp); @@ -69,7 +69,7 @@ KFR_INTRIN univector<T> correlate(const univector<T, Tag1>& src1, const univecto univector<complex<T>> src2padded = reverse(src2); src1padded.resize(size, 0); src2padded.resize(size, 0); - dft_plan_ptr<T> dft = dft_cache::instance().get(ctype<T>, size); + dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size); univector<u8> temp(dft->temp_size); dft->execute(src1padded, src1padded, temp); dft->execute(src2padded, src2padded, temp); @@ -87,4 +87,4 @@ KFR_INTRIN univector<T> autocorrelate(const univector<T, Tag1>& src) return result; } } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -35,11 +35,14 @@ #include "bitrev.hpp" #include "ft.hpp" -#pragma clang diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") -#pragma clang diagnostic ignored "-Wshadow" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") #endif +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4100)) + namespace kfr { @@ -65,9 +68,9 @@ protected: virtual void do_execute(complex<T>*, const complex<T>*, u8* temp) = 0; }; -#pragma clang diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wassume") -#pragma clang diagnostic ignored "-Wassume" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") #endif namespace internal @@ -75,15 +78,17 @@ namespace internal template <size_t width, bool inverse, typename T> KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>, - cvec<T, width> w, cvec<T, width> tw) + const cvec<T, width>& w, const cvec<T, width>& tw) { - cvec<T, width> b1 = w * dupeven(tw); - w = swap<2>(w); + cvec<T, width> ww = w; + cvec<T, width> tw_ = tw; + cvec<T, width> b1 = ww * dupeven(tw_); + ww = swap<2>(ww); if (inverse) - tw = -(tw); - w = subadd(b1, w * dupodd(tw)); - return w; + tw_ = -(tw_); + ww = subadd(b1, ww * dupodd(tw_)); + return ww; } template <size_t width, bool use_br2, bool inverse, bool aligned, typename T> @@ -107,9 +112,9 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfals cwrite<width, aligned>(out, sum02 + sum13); w2 = sum02 - sum13; - cwrite<width, aligned>( - out + N4 * (use_br2 ? 1 : 2), - radix4_apply_twiddle(csize<width>, cfalse, cbool<inverse>, w2, cread<width, true>(twiddle + width))); + cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2), + radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2, + cread<width, true>(twiddle + width))); diff02 = a0 - a2; diff13 = a1 - a3; if (inverse) @@ -125,17 +130,17 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfals w1 = diff02 + diff13; - cwrite<width, aligned>( - out + N4 * (use_br2 ? 2 : 1), - radix4_apply_twiddle(csize<width>, cfalse, cbool<inverse>, w1, cread<width, true>(twiddle + 0))); + cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1), + radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1, + cread<width, true>(twiddle + 0))); w3 = diff02 - diff13; - cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize<width>, cfalse, cbool<inverse>, w3, - cread<width, true>(twiddle + width * 2))); + cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), + w3, cread<width, true>(twiddle + width * 2))); } template <size_t width, bool inverse, typename T> KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>, - cvec<T, width> w, cvec<T, width> tw) + const cvec<T, width>& w, const cvec<T, width>& tw) { vec<T, width> re1, im1, twre, twim; split(w, re1, im1); @@ -144,10 +149,9 @@ KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_ const vec<T, width> b1re = re1 * twre; const vec<T, width> b1im = im1 * twre; if (inverse) - w = concat(b1re + im1 * twim, b1im - re1 * twim); + return concat(b1re + im1 * twim, b1im - re1 * twim); else - w = concat(b1re - im1 * twim, b1im + re1 * twim); - return w; + return concat(b1re - im1 * twim, b1im + re1 * twim); } template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T> @@ -175,8 +179,8 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im)); w2 = concat(sum02re - sum13re, sum02im - sum13im); cwrite_split<width, aligned, write_split>( - out + N4 * (use_br2 ? 1 : 2), - radix4_apply_twiddle(csize<width>, ctrue, cbool<inverse>, w2, cread<width, true>(twiddle + width))); + out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2, + cread<width, true>(twiddle + width))); const vec<T, width> diff02re = re0 - re2; const vec<T, width> diff02im = im0 - im2; @@ -187,11 +191,11 @@ KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); cwrite_split<width, aligned, write_split>( - out + N4 * (use_br2 ? 2 : 1), - radix4_apply_twiddle(csize<width>, ctrue, cbool<inverse>, w1, cread<width, true>(twiddle + 0))); - cwrite_split<width, aligned, write_split>(out + N4 * 3, - radix4_apply_twiddle(csize<width>, ctrue, cbool<inverse>, w3, - cread<width, true>(twiddle + width * 2))); + out + N4 * (use_br2 ? 2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1, + cread<width, true>(twiddle + 0))); + cwrite_split<width, aligned, write_split>( + out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3, + cread<width, true>(twiddle + width * 2))); } template <typename T> @@ -254,30 +258,29 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s } } -template <typename T> -KFR_SINTRIN void prefetch_one(const complex<T>* in) -{ #ifdef CMT_ARCH_X86 - __builtin_prefetch(ptr_cast<void>(in), 0, _MM_HINT_T0); +#ifdef CMT_COMPILER_GNU +#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); +#else +#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0); +#endif #else - __builtin_prefetch(ptr_cast<void>(in)); +#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr)); #endif + +template <typename T> +KFR_SINTRIN void prefetch_one(const complex<T>* in) +{ + KFR_PREFETCH(in); } template <typename T> KFR_SINTRIN void prefetch_four(size_t stride, const complex<T>* in) { -#ifdef CMT_ARCH_X86 - __builtin_prefetch(ptr_cast<void>(in), 0, _MM_HINT_T0); - __builtin_prefetch(ptr_cast<void>(in + stride), 0, _MM_HINT_T0); - __builtin_prefetch(ptr_cast<void>(in + stride * 2), 0, _MM_HINT_T0); - __builtin_prefetch(ptr_cast<void>(in + stride * 3), 0, _MM_HINT_T0); -#else - __builtin_prefetch(ptr_cast<void>(in)); - __builtin_prefetch(ptr_cast<void>(in + stride)); - __builtin_prefetch(ptr_cast<void>(in + stride * 2)); - __builtin_prefetch(ptr_cast<void>(in + stride * 3)); -#endif + KFR_PREFETCH(in); + KFR_PREFETCH(in + stride); + KFR_PREFETCH(in + stride * 2); + KFR_PREFETCH(in + stride * 3); } template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, @@ -287,20 +290,21 @@ KFR_SINTRIN cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t complex<T>* out, const complex<T>* in, const complex<T>*& twiddle) { constexpr static size_t prefetch_offset = width * 8; - const auto N4 = N / csize<4>; - const auto N43 = N4 * csize<3>; + const auto N4 = N / csize_t<4>(); + const auto N43 = N4 * csize_t<3>(); CMT_ASSUME(blocks > 0); CMT_ASSUME(N > 0); CMT_ASSUME(N4 > 0); CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) { -#pragma clang loop unroll_count(2) + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) for (size_t n2 = 0; n2 < N4; n2 += width) { if (prefetch) prefetch_four(N4, in + prefetch_offset); - radix4_body(N, csize<width>, cbool < splitout || splitin >, cbool<splitout>, cbool<splitin>, - cbool<use_br2>, cbool<inverse>, cbool<aligned>, out, in, twiddle + n2 * 3); + radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(), + cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out, + in, twiddle + n2 * 3); in += width; out += width; } @@ -321,7 +325,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfal for (size_t b = 0; b < blocks; b++) { if (prefetch) - prefetch_four(csize<64>, out + prefetch_offset); + prefetch_four(csize_t<64>(), out + prefetch_offset); cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; split(cread<8, aligned>(out + 0), w0, w1); split(cread<8, aligned>(out + 8), w2, w3); @@ -330,13 +334,13 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfal butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); - w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>); - w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>); - w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>); - w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>); - w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>); - w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>); - w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>); + w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); + w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); + w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); + w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); + w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); + w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); + w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); cvec<T, 8> z0, z1, z2, z3; transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); @@ -380,7 +384,7 @@ KFR_SINTRIN ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfal { CMT_ASSUME(blocks > 0); constexpr static size_t prefetch_offset = width * 4; -#pragma clang loop unroll_count(2) + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) for (size_t b = 0; b < blocks; b += 2) { if (prefetch) @@ -434,13 +438,14 @@ struct fft_stage_impl : dft_stage<T> this->stage_size = stage_size; this->repeats = 4; this->recursion = true; - this->data_size = align_up(sizeof(complex<T>) * stage_size / 4 * 3, native_cache_alignment); + this->data_size = + align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment); } protected: constexpr static bool prefetch = true; constexpr static bool aligned = false; - constexpr static size_t width = vector_width<T, cpu_t::native>; + constexpr static size_t width = platform<T>::vector_width; virtual void do_initialize(size_t size) override final { @@ -452,12 +457,12 @@ protected: { const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); if (splitin) - in = out; - const size_t stage_size = this->stage_size; - CMT_ASSUME(stage_size >= 2048); - CMT_ASSUME(stage_size % 2048 == 0); - radix4_pass(stage_size, 1, csize<width>, ctrue, cbool<splitin>, cbool<!is_even>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, in, twiddle); + in = out; + const size_t stg_size = this->stage_size; + CMT_ASSUME(stg_size >= 2048); + CMT_ASSUME(stg_size % 2048 == 0); + radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); } }; @@ -470,11 +475,11 @@ struct fft_final_stage_impl : dft_stage<T> this->out_offset = size; this->repeats = 4; this->recursion = true; - this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, native_cache_alignment); + this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment); } protected: - constexpr static size_t width = vector_width<T, cpu_t::native>; + constexpr static size_t width = platform<T>::vector_width; constexpr static bool is_even = cometa::is_even(ilog2(size)); constexpr static bool use_br2 = !is_even; constexpr static bool aligned = false; @@ -483,11 +488,11 @@ protected: virtual void do_initialize(size_t total_size) override final { complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - size_t stage_size = this->stage_size; - while (stage_size > 4) + size_t stg_size = this->stage_size; + while (stg_size > 4) { - initialize_twiddles<T, width>(twiddle, stage_size, total_size, true); - stage_size /= 4; + initialize_twiddles<T, width>(twiddle, stg_size, total_size, true); + stg_size /= 4; } } @@ -496,55 +501,55 @@ protected: constexpr bool is_double = sizeof(T) == 8; constexpr size_t final_size = is_even ? (is_double ? 4 : 16) : (is_double ? 8 : 32); const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_pass(csize<final_size>, out, in, twiddle); + final_pass(csize_t<final_size>(), out, in, twiddle); } KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(512, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, in, twiddle); - radix4_pass(128, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(32, 16, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(csize<8>, 64, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(512, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(128, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(32, 16, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<8>(), 64, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(512, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, in, twiddle); - radix4_pass(128, 4, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(csize<32>, 16, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(512, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(128, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<32>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } KFR_INTRIN void final_pass(csize_t<4>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(1024, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, in, twiddle); - radix4_pass(256, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(64, 16, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(16, 64, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(csize<4>, 256, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(1024, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(256, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(64, 16, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(16, 64, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<4>(), 256, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } KFR_INTRIN void final_pass(csize_t<16>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(1024, 1, csize<width>, ctrue, cbool<splitin>, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, in, twiddle); - radix4_pass(256, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(64, 16, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(csize<16>, 64, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(1024, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(256, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(64, 16, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<16>(), 64, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } }; @@ -565,7 +570,7 @@ protected: virtual void do_execute(complex<T>* out, const complex<T>*, u8* /*temp*/) override final { - fft_reorder(out, log2n, cbool<!is_even>); + fft_reorder(out, log2n, cbool_t<!is_even>()); } }; @@ -596,7 +601,7 @@ protected: { cvec<T, 1> a0, a1, a2, a3; split(cread<4>(in), a0, a1, a2, a3); - butterfly(cbool<inverse>, a0, a1, a2, a3, a0, a1, a2, a3); + butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3); cwrite<4>(out, concat(a0, a1, a2, a3)); } }; @@ -651,7 +656,7 @@ protected: constexpr static bool aligned = false; virtual void do_execute(complex<T>* out, const complex<T>* in, u8*) override final { - butterfly64(cbool<inverse>, cbool<aligned>, out, in); + butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in); } }; @@ -661,12 +666,12 @@ struct fft_specialization<T, 7, inverse> : dft_stage<T> fft_specialization(size_t) { this->stage_size = 128; - this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, native_cache_alignment); + this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); } protected: constexpr static bool aligned = false; - constexpr static size_t width = vector_width<T, cpu_t::native>; + constexpr static size_t width = platform<T>::vector_width; constexpr static bool use_br2 = true; constexpr static bool prefetch = false; constexpr static bool is_double = sizeof(T) == 8; @@ -684,26 +689,26 @@ protected: virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override final { const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_pass(csize<final_size>, out, in, twiddle); - fft_reorder(out, csize<7>); + final_pass(csize_t<final_size>(), out, in, twiddle); + fft_reorder(out, csize_t<7>()); } KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(128, 1, csize<width>, ctrue, cfalse, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, in, twiddle); - radix4_pass(32, 4, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(csize<8>, 16, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(128, 1, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, in, twiddle); - radix4_pass(csize<32>, 4, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } }; @@ -749,12 +754,12 @@ struct fft_specialization<double, 8, inverse> : dft_stage<double> fft_specialization(size_t) { this->stage_size = 256; - this->data_size = align_up(sizeof(complex<T>) * 256 * 3 / 2, native_cache_alignment); + this->data_size = align_up(sizeof(complex<T>) * 256 * 3 / 2, platform<>::native_cache_alignment); } protected: constexpr static bool aligned = false; - constexpr static size_t width = vector_width<T, cpu_t::native>; + constexpr static size_t width = platform<T>::vector_width; constexpr static bool use_br2 = false; constexpr static bool prefetch = false; constexpr static size_t split_format = true; @@ -770,20 +775,20 @@ protected: virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override final { const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - final_pass(csize<4>, out, in, twiddle); - fft_reorder(out, csize<8>); + final_pass(csize_t<4>(), out, in, twiddle); + fft_reorder(out, csize_t<8>()); } KFR_INTRIN void final_pass(csize_t<4>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle) { - radix4_pass(256, 1, csize<width>, ctrue, cfalse, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, in, twiddle); - radix4_pass(64, 4, csize<width>, ctrue, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(16, 16, csize<width>, cfalse, ctrue, cbool<use_br2>, cbool<prefetch>, cbool<inverse>, - cbool<aligned>, out, out, twiddle); - radix4_pass(csize<4>, 64, csize<width>, cfalse, cfalse, cbool<use_br2>, cbool<prefetch>, - cbool<inverse>, cbool<aligned>, out, out, twiddle); + radix4_pass(256, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); + radix4_pass(64, 4, csize_t<width>(), ctrue, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(16, 16, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), + cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); + radix4_pass(csize_t<4>(), 64, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), + cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); } }; @@ -835,15 +840,16 @@ struct dft_plan if (is_poweroftwo(size)) { const size_t log2n = ilog2(size); - cswitch(csizes<1, 2, 3, 4, 5, 6, 7, 8>, log2n, + cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8>(), log2n, [&](auto log2n) { - add_stage<internal::fft_specialization_t<T, val_of(decltype(log2n)()), - false>::template type>(size, type); + (void)log2n; + this->add_stage<internal::fft_specialization_t<T, val_of(decltype(log2n)()), + false>::template type>(size, type); }, [&]() { cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { - make_fft(size, type, is_even, ctrue); - add_stage<internal::fft_reorder_stage_impl_t< + this->make_fft(size, type, is_even, ctrue); + this->add_stage<internal::fft_reorder_stage_impl_t< T, val_of(decltype(is_even)())>::template type>(size, type); }); }); @@ -927,7 +933,7 @@ protected: { add_stage<fft_stage_impl_t::template type>(stage_size, type); - make_fft(stage_size / 4, cbools<direct, inverse>, cbool<is_even>, cfalse); + make_fft(stage_size / 4, cbools_t<direct, inverse>(), cbool_t<is_even>(), cfalse); } else { @@ -1021,15 +1027,13 @@ struct dft_plan_real : dft_plan<T> : dft_plan<T>(size / 2, type), rtwiddle(size / 4) { using namespace internal; - const size_t csize = size / 2; - const size_t count = size / 4; - constexpr size_t width = vector_width<T> * 2; + constexpr size_t width = platform<T>::vector_width * 2; - block_process(count, csizes<width, 1>, [=](size_t i, auto w) { + block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); cwrite<width>(rtwiddle.data() + i, - cossin(dup(-c_pi<T> * ((enumerate<T, width>() + i + count) / csize)))); + cossin(dup(-c_pi<T> * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); }); } @@ -1069,13 +1073,13 @@ private: void to_fmt(complex<T>* out, dft_pack_format fmt) const { using namespace internal; - const size_t csize = this->size; // / 2; + size_t csize = this->size; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 - constexpr size_t width = vector_width<T> * 2; + constexpr size_t width = platform<T>::vector_width * 2; const cvec<T, 1> dc = cread<1>(out); const size_t count = csize / 2; - block_process(count, csizes<width, 1>, [=](size_t i, auto w) { + block_process(count, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); constexpr size_t widthm1 = width - 1; const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); @@ -1122,10 +1126,10 @@ private: dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag()); } - constexpr size_t width = vector_width<T> * 2; + constexpr size_t width = platform<T>::vector_width * 2; const size_t count = csize / 2; - block_process(count, csizes<width, 1>, [=](size_t i, auto w) { + block_process(count, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); constexpr size_t widthm1 = width - 1; const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); @@ -1150,4 +1154,6 @@ private: }; } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp @@ -37,6 +37,9 @@ #include "../base/memory.hpp" #include "../data/sincos.hpp" +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4127)) + namespace kfr { @@ -44,18 +47,18 @@ namespace internal { template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) { return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_impl(vec<T, N> x, vec<T, 2> y) +CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) { vec<T, N> yy = resize<N>(y); return cmul_impl(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) { vec<T, N> xx = resize<N>(x); return cmul_impl(xx, y); @@ -63,23 +66,24 @@ CMT_INLINE vec<T, N> cmul_impl(vec<T, 2> x, vec<T, N> y) /// Complex Multiplication template <typename T, size_t N1, size_t N2> -CMT_INLINE vec<T, const_max(N1, N2)> cmul(vec<T, N1> x, vec<T, N2> y) +CMT_INLINE vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y) { return internal::cmul_impl(x, y); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) { return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE vec<T, N> cmul_2conj(vec<T, N> in0, vec<T, N> in1, vec<T, N> tw) +CMT_INLINE vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) { return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec<T, 2> in1, vec<T, N> tw) +CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, + const vec<T, N>& tw) { const vec<T, N> twr = dupeven(tw); const vec<T, N> twi = dupodd(tw); @@ -91,13 +95,13 @@ CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, vec<T, 2> in0, vec< out1 += sumtw - diftw; } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_conj(vec<T, N> x, vec<T, 2> y) +CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) { vec<T, N> yy = resize<N>(y); return cmul_conj(x, yy); } template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -CMT_INLINE vec<T, N> cmul_conj(vec<T, 2> x, vec<T, N> y) +CMT_INLINE vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) { vec<T, N> xx = resize<N>(x); return cmul_conj(xx, y); @@ -113,7 +117,7 @@ CMT_INLINE cvec<T, N> cread(const complex<T>* src) } template <size_t N, bool A = false, typename T> -CMT_INLINE void cwrite(complex<T>* dest, cvec<T, N> value) +CMT_INLINE void cwrite(complex<T>* dest, const cvec<T, N>& value) { return simd_write<A, N * 2>(ptr_cast<T>(dest), *value); } @@ -124,7 +128,7 @@ CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<i return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> -CMT_INLINE void cwrite_group_impl(complex<T>* dest, cvec<T, count * N> value, csizes_t<indices...>) +CMT_INLINE void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... }; } @@ -135,7 +139,7 @@ CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t str return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, bool A, typename T, size_t... indices> -CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, cvec<T, count * N> value, +CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value, csizes_t<indices...>) { swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... }; @@ -144,25 +148,25 @@ CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, cvec<T, count template <size_t count, size_t N, size_t stride, bool A = false, typename T> CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src) { - return cread_group_impl<count, N, stride, A>(src, csizeseq<count>); + return cread_group_impl<count, N, stride, A>(src, csizeseq_t<count>()); } template <size_t count, size_t N, size_t stride, bool A = false, typename T> -CMT_INLINE void cwrite_group(complex<T>* dest, cvec<T, count * N> value) +CMT_INLINE void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value) { - return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq<count>); + return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq_t<count>()); } template <size_t count, size_t N, bool A = false, typename T> CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) { - return cread_group_impl<count, N, A>(src, stride, csizeseq<count>); + return cread_group_impl<count, N, A>(src, stride, csizeseq_t<count>()); } template <size_t count, size_t N, bool A = false, typename T> -CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, cvec<T, count * N> value) +CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) { - return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq<count>); + return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq_t<count>()); } template <size_t N, bool A = false, bool split = false, typename T> @@ -175,11 +179,12 @@ CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) } template <size_t N, bool A = false, bool split = false, typename T> -CMT_INLINE void cwrite_split(complex<T>* dest, cvec<T, N> value) +CMT_INLINE void cwrite_split(complex<T>* dest, const cvec<T, N>& value) { + cvec<T, N> v = value; if (split) - value = interleavehalfs(value); - simd_write<A, N * 2>(ptr_cast<T>(dest), *value); + v = interleavehalfs(v); + simd_write<A, N * 2>(ptr_cast<T>(dest), *v); } template <> @@ -209,47 +214,51 @@ inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src) } template <> -inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, cvec<f32, 8> x) +inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) { - x = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); + const cvec<f32, 8> xx = + concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); cvec<f32, 2> a, b, c, d; - split(x, a, b, c, d); + split(xx, a, b, c, d); cwrite<2>(dest, a); cwrite<2>(dest + 4, b); cwrite<2>(dest + 2, c); cwrite<2>(dest + 6, d); } template <> -inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, cvec<f32, 8> x) +inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) { - x = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); + const cvec<f32, 8> xx = + concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); cvec<f32, 2> a, b, c, d; - split(x, a, b, c, d); - cwrite<2, true>(dest, a); + split(xx, a, b, c, d); + cwrite<2, true>(dest + 0, a); cwrite<2, true>(dest + 4, b); cwrite<2, true>(dest + 2, c); cwrite<2, true>(dest + 6, d); } template <> -inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, cvec<f64, 4> x) +inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) { - x = concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); - cwrite<1>(dest, part<4, 0>(x)); - cwrite<1>(dest + 2, part<4, 1>(x)); - cwrite<1>(dest + 1, part<4, 2>(x)); - cwrite<1>(dest + 3, part<4, 3>(x)); + const cvec<f64, 4> xx = + concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); + cwrite<1>(dest, part<4, 0>(xx)); + cwrite<1>(dest + 2, part<4, 1>(xx)); + cwrite<1>(dest + 1, part<4, 2>(xx)); + cwrite<1>(dest + 3, part<4, 3>(xx)); } template <> -inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, cvec<f64, 4> x) +inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) { - x = concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); - cwrite<1, true>(dest, part<4, 0>(x)); - cwrite<1, true>(dest + 2, part<4, 1>(x)); - cwrite<1, true>(dest + 1, part<4, 2>(x)); - cwrite<1, true>(dest + 3, part<4, 3>(x)); + const cvec<f64, 4> xx = + concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); + cwrite<1, true>(dest + 0, part<4, 0>(xx)); + cwrite<1, true>(dest + 2, part<4, 1>(xx)); + cwrite<1, true>(dest + 1, part<4, 2>(xx)); + cwrite<1, true>(dest + 3, part<4, 3>(xx)); } template <size_t N, size_t stride, typename T, size_t... Indices> @@ -266,7 +275,7 @@ CMT_INLINE cvec<T, N> cgather(const complex<T>* base) return ref_cast<cvec<T, N>>(*base); } else - return cgather_helper<N, stride, T>(base, csizeseq<N>); + return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); } CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) @@ -294,13 +303,13 @@ CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size template <size_t N, typename T> CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) { - return cgather_helper<N, T>(base, index, stride, csizeseq<N>); + return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); } template <size_t N, typename T> CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride) { size_t index = 0; - return cgather_helper<N, T>(base, index, stride, csizeseq<N>); + return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); } template <size_t N, typename T, size_t... Indices> @@ -313,17 +322,17 @@ CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size template <size_t N, typename T> CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) { - return cgather_helper<N, T>(base, index, stride, size, csizeseq<N>); + return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>()); } template <size_t N, size_t stride, typename T, size_t... Indices> -CMT_INLINE void cscatter_helper(complex<T>* base, cvec<T, N> value, csizes_t<Indices...>) +CMT_INLINE void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; } template <size_t N, size_t stride, typename T> -CMT_INLINE void cscatter(complex<T>* base, cvec<T, N> value) +CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value) { if (stride == 1) { @@ -331,38 +340,39 @@ CMT_INLINE void cscatter(complex<T>* base, cvec<T, N> value) } else { - return cscatter_helper<N, stride, T>(base, value, csizeseq<N>); + return cscatter_helper<N, stride, T>(base, value, csizeseq_t<N>()); } } template <size_t N, typename T, size_t... Indices> -CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, cvec<T, N> value, csizes_t<Indices...>) +CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, + csizes_t<Indices...>) { swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; } template <size_t N, typename T> -CMT_INLINE void cscatter(complex<T>* base, size_t stride, cvec<T, N> value) +CMT_INLINE void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) { - return cscatter_helper<N, T>(base, stride, value, csizeseq<N>); + return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>()); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, vec<IT, N> offset) +CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) { - return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq<N>); + return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>()); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -CMT_INLINE void cscatter(complex<T>* base, vec<IT, N> offset, vec<T, N * 2 * groupsize> value) +CMT_INLINE void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) { - return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq<N>); + return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>()); } template <typename T> -KFR_INTRIN void transpose4x8(cvec<T, 8> z0, cvec<T, 8> z1, cvec<T, 8> z2, cvec<T, 8> z3, cvec<T, 4>& w0, - cvec<T, 4>& w1, cvec<T, 4>& w2, cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, - cvec<T, 4>& w6, cvec<T, 4>& w7) +KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2, + const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, + cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, cvec<T, 4>& w7) { cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3)); cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3)); @@ -379,8 +389,9 @@ KFR_INTRIN void transpose4x8(cvec<T, 8> z0, cvec<T, 8> z1, cvec<T, 8> z2, cvec<T } template <typename T> -KFR_INTRIN void transpose4x8(cvec<T, 4> w0, cvec<T, 4> w1, cvec<T, 4> w2, cvec<T, 4> w3, cvec<T, 4> w4, - cvec<T, 4> w5, cvec<T, 4> w6, cvec<T, 4> w7, cvec<T, 8>& z0, cvec<T, 8>& z1, +KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, + const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, + const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, cvec<T, 8>& z2, cvec<T, 8>& z3) { cvec<T, 16> a = concat(w0, w1, w2, w3); @@ -453,20 +464,20 @@ constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices... : cos_using_table<T>(size, indices / 2 * step + start))...); } -template <typename T, size_t width, size_t size, size_t start, size_t step, bool inverse = false> -constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle() +template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false> +constexpr KFR_INTRIN cvec<T, width> fixed_twiddle() { - return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq<width * 2>); + return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>()); } template <typename T, size_t width> -constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle(size_t size, size_t start, size_t step = 0) +constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) { - return get_fixed_twiddle_helper<T, width>(csizeseq<width * 2>, start, step, size); + return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size); } -template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false> -constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); +// template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false> +// constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); template <typename T, size_t N, bool inverse> constexpr cvec<T, N> twiddleimagmask() @@ -474,19 +485,19 @@ constexpr cvec<T, N> twiddleimagmask() return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1); } -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wconversion" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wconversion") -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) template <typename T, size_t N> -CMT_NOINLINE static vec<T, N> cossin_conj(vec<T, N> x) +CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x) { return cconj(cossin(x)); } template <size_t k, size_t size, bool inverse = false, typename T, size_t width> -KFR_INTRIN vec<T, width> cmul_by_twiddle(vec<T, width> x) +KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x) { constexpr size_t kk = (inverse ? size - k : k) % size; constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); @@ -524,12 +535,12 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(vec<T, width> x) } else { - return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>)); + return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>())); } } template <size_t N, typename T> -KFR_INTRIN void butterfly2(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N>& w0, cvec<T, N>& w1) +KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) { w0 = a0 + a1; w1 = a0 - a1; @@ -542,8 +553,9 @@ KFR_INTRIN void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, - cvec<T, N> a3, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) { cvec<T, N> sum02, sum13, diff02, diff13; cvec<T, N * 2> a01, a23, sum0213, diff0213; @@ -575,8 +587,9 @@ KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, cvec<T, N> a0, cvec<T, N> } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, - cvec<T, N> a3, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) { vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3; vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3; @@ -601,15 +614,16 @@ KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, cvec<T, N> a0, cvec<T, N> a } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly8(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> a4, - cvec<T, N> a5, cvec<T, N> a6, cvec<T, N> a7, cvec<T, N>& w0, cvec<T, N>& w1, +KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7) { cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6; - butterfly4<N, inverse>(cbool<false>, b0, b2, b4, b6, b0, b2, b4, b6); + butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6); cvec<T, N> b1 = a1, b3 = a3, b5 = a5, b7 = a7; - butterfly4<N, inverse>(cbool<false>, b1, b3, b5, b7, b1, b3, b5, b7); + butterfly4<N, inverse>(cfalse, b1, b3, b5, b7, b1, b3, b5, b7); w0 = b0 + b1; w4 = b0 - b1; @@ -637,7 +651,7 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv { cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67; - butterfly4<2, inverse>(cbool<false>, b01, b23, b45, b67, b01, b23, b45, b67); + butterfly4<2, inverse>(cfalse, b01, b23, b45, b67, b01, b23, b45, b67); cvec<T, 2> b02, b13, b46, b57; @@ -645,8 +659,8 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv cvec<T, 8> b02461357 = concat(even<2>(b01234567), odd<2>(b01234567)); split(b02461357, b02, b46, b13, b57); - b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>); - b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>); + b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>()); + b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>()); a01 = b02 + b13; a23 = b46 + b57; a45 = b02 - b13; @@ -669,13 +683,13 @@ KFR_INTRIN void butterfly32(cvec<T, 32>& v32) split(v32, w0, w1, w2, w3, w4, w5, w6, w7); butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); - w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>); - w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>); - w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>); - w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>); - w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>); - w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>); - w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>); + w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); + w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); + w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); + w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); + w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); + w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); + w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); cvec<T, 8> z0, z1, z2, z3; transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); @@ -707,7 +721,7 @@ KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01) } template <size_t N, bool inverse = false, bool split_format = false, typename T> -KFR_INTRIN void apply_twiddle(cvec<T, N> a1, cvec<T, N> tw1, cvec<T, N>& w1) +KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) { if (split_format) { @@ -723,18 +737,20 @@ KFR_INTRIN void apply_twiddle(cvec<T, N> a1, cvec<T, N> tw1, cvec<T, N>& w1) } else { - cvec<T, N> b1 = a1 * dupeven(tw1); - a1 = swap<2>(a1); + const cvec<T, N> b1 = a1 * dupeven(tw1); + const cvec<T, N> a1_ = swap<2>(a1); + cvec<T, N> tw1_ = tw1; if (inverse) - tw1 = -(tw1); - w1 = subadd(b1, a1 * dupodd(tw1)); + tw1_ = -(tw1_); + w1 = subadd(b1, a1_ * dupodd(tw1_)); } } template <size_t N, bool inverse = false, bool split_format = false, typename T> -KFR_INTRIN void apply_twiddles4(cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> tw1, cvec<T, N> tw2, - cvec<T, N> tw3, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) +KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, + const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, + cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) { apply_twiddle<N, inverse, split_format>(a1, tw1, w1); apply_twiddle<N, inverse, split_format>(a2, tw2, w2); @@ -743,14 +759,16 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cve template <size_t N, bool inverse = false, typename T> KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3) + cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, + const cvec<T, N>& tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3); } template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]> KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict a3, cvec<T, 1> tw1, cvec<T, 1> tw2, cvec<T, 1> tw3) + cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, + const cvec<T, 1>& tw3) { apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); } @@ -800,9 +818,9 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) cvec<T, N> a3; split(a0123, a0, a1, a2, a3); - cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>, - tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>, - tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>; + cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>(), + tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>(), + tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>(); apply_twiddles4<N>(a1, a2, a3, tw1, tw2, tw3); @@ -984,23 +1002,23 @@ KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) w3 = digitreverse4<2>(w3); transpose4(w0, w1, w2, w3); - cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>)); - cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>)); - cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>)); - cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>)); + cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>())); + cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>())); + cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>())); + cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>())); } template <size_t n2, size_t nnstep, size_t N, typename T> KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1) { - cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>; + cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>(); a1 = cmul(a1, tw1); } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01, - cvec<T, N>& w02) +KFR_INTRIN void butterfly3(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, + cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02) { constexpr cvec<T, N> tw3r1 = static_cast<T>(-0.5); constexpr cvec<T, N> tw3i1 = @@ -1025,9 +1043,9 @@ KFR_INTRIN void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly6(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> a4, - cvec<T, N> a5, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, - cvec<T, N>& w4, cvec<T, N>& w5) +KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, cvec<T, N>& w0, + cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5) { cvec<T, N* 2> a03 = concat(a0, a3); cvec<T, N* 2> a25 = concat(a2, a5); @@ -1056,8 +1074,9 @@ KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, - cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, +KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, + const cvec<T, N>& a03, const cvec<T, N>& a04, const cvec<T, N>& a05, + const cvec<T, N>& a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06) { constexpr cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884); @@ -1102,9 +1121,9 @@ KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly5(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, - cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, cvec<T, N>& w03, - cvec<T, N>& w04) +KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, + const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) { constexpr cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718); constexpr cvec<T, N> tw5i1 = @@ -1132,10 +1151,12 @@ KFR_INTRIN void butterfly5(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec< } template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly10(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, cvec<T, N> a4, - cvec<T, N> a5, cvec<T, N> a6, cvec<T, N> a7, cvec<T, N> a8, cvec<T, N> a9, - cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, - cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8, cvec<T, N>& w9) +KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, + cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, + cvec<T, N>& w8, cvec<T, N>& w9) { cvec<T, N* 2> a05 = concat(a0, a5); cvec<T, N* 2> a27 = concat(a2, a7); @@ -1170,64 +1191,69 @@ KFR_INTRIN void butterfly10(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N>& out0, vec<T, N>& out1) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, + vec<T, N>& out1) { butterfly2<N / 2>(in0, in1, out0, out1); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N>& out0, - vec<T, N>& out1, vec<T, N>& out2) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) { butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3, - vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3) { butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3, - vec<T, N> in4, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, - vec<T, N>& out4) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4) { butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3, - vec<T, N> in4, vec<T, N> in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, - vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, vec<T, N>& out0, + vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) { butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3, - vec<T, N> in4, vec<T, N> in5, vec<T, N> in6, vec<T, N>& out0, vec<T, N>& out1, - vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, + const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6) { butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3, - vec<T, N> in4, vec<T, N> in5, vec<T, N> in6, vec<T, N> in7, vec<T, N>& out0, - vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, - vec<T, N>& out6, vec<T, N>& out7) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, + const vec<T, N>& in6, const vec<T, N>& in7, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, + vec<T, N>& out7) { butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7); } template <bool inverse, typename T, size_t N> -KFR_INTRIN void butterfly(cbool_t<inverse>, vec<T, N> in0, vec<T, N> in1, vec<T, N> in2, vec<T, N> in3, - vec<T, N> in4, vec<T, N> in5, vec<T, N> in6, vec<T, N> in7, vec<T, N> in8, - vec<T, N> in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, - vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, - vec<T, N>& out9) +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, + const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, + const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, + vec<T, N>& out8, vec<T, N>& out9) { butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9); } -template <bool transposed, typename T, size_t... N, size_t Nout = csum(csizes<N...>)> +template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr)); @@ -1255,7 +1281,7 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3 w4 = cgather<4, 5>(ptr + 4); } -template <bool transposed, typename T, size_t... N, size_t Nout = csum(csizes<N...>)> +template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) { auto temp = concat(args...); @@ -1265,12 +1291,12 @@ KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N } template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> -KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, vec<T, N> x, const complex<T>* twiddle) +KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) { return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1))); } template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> -KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, vec<T, N> x, const complex<T>* twiddle) +KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) { return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1))); } @@ -1282,13 +1308,14 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize { carray<cvec<T, width>, radix> inout; - swallow{ (inout.get(csize<I>) = cread<width>(in + i + stride * I))... }; + swallow{ (inout.get(csize_t<I>()) = cread<width>(in + i + stride * I))... }; - butterfly(cbool<inverse>, inout.template get<I>()..., inout.template get<I>()...); + butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...); - swallow{ (cwrite<width>(out + i + stride * I, - mul_tw<I, radix>(cbool<inverse>, inout.template get<I>(), tw + i * (radix - 1))), - 0)... }; + swallow{ ( + cwrite<width>(out + i + stride * I, + mul_tw<I, radix>(cbool_t<inverse>(), inout.template get<I>(), tw + i * (radix - 1))), + 0)... }; } // Final @@ -1299,17 +1326,17 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize carray<cvec<T, width>, radix> inout; // swallow{ ( inout.get( csize<I> ) = infn( i, I, cvec<T, width>( ) ) )... }; - cread_transposed(cbool<true>, in + i * radix, inout.template get<I>()...); + cread_transposed(ctrue, in + i * radix, inout.template get<I>()...); - butterfly(cbool<inverse>, inout.template get<I>()..., inout.template get<I>()...); + butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...); - swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize<I>)), 0)... }; + swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize_t<I>())), 0)... }; } template <size_t width, size_t radix, typename... Args> KFR_INTRIN void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) { - butterfly_helper(csizeseq<radix>, i, csize<width>, csize<radix>, std::forward<Args>(args)...); + butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...); } template <typename... Args> @@ -1321,8 +1348,8 @@ KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&. { CMT_LOOP_NOUNROLL for (; i < count / width * width; i += width) - butterfly(i, csize<width>, std::forward<Args>(args)...); - butterfly_cycle(i, count, csize<width / 2>, std::forward<Args>(args)...); + butterfly(i, csize_t<width>(), std::forward<Args>(args)...); + butterfly_cycle(i, count, csize_t<width / 2>(), std::forward<Args>(args)...); } template <size_t width, typename... Args> @@ -1330,7 +1357,7 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args) { CMT_ASSUME(count > 0); size_t i = 0; - butterfly_cycle(i, count, csize<width>, std::forward<Args>(args)...); + butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...); } template <typename T, bool inverse, typename Tstride> @@ -1376,7 +1403,7 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<in reverse<2>(sum1)); } } - generic_butterfly_cycle(csize<width / 2>, radix, cbool<inverse>, out, in, ostride, halfradix, + generic_butterfly_cycle(csize_t<width / 2>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, halfradix_sqr, twiddle, i); } @@ -1406,8 +1433,8 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* CMT_ASSUME(halfradix > 0); size_t i = 0; - generic_butterfly_cycle(csize<width>, radix, cbool<inverse>, out, in, ostride, halfradix, halfradix_sqr, - twiddle, i); + generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, + halfradix_sqr, twiddle, i); } template <typename T, bool inverse, typename Tstride = csize_t<1>> @@ -1419,14 +1446,14 @@ KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* ou builtin_memcpy(temp, in, sizeof(complex<T>) * radix); in = temp; } - constexpr size_t width = vector_width<T, cpu_t::native>; + constexpr size_t width = platform<T>::vector_width; - cswitch(csizes<11>, radix, + cswitch(csizes_t<11>(), radix, [&](auto radix_) CMT_INLINE_LAMBDA { - generic_butterfly_w<width>(decltype(radix_)(), cbool<inverse>, out, in, twiddle, ostride); + generic_butterfly_w<width>(decltype(radix_)(), cbool_t<inverse>(), out, in, twiddle, ostride); }, [&]() CMT_INLINE_LAMBDA { - generic_butterfly_w<width>(radix, cbool<inverse>, out, in, twiddle, ostride); + generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); }); } @@ -1437,13 +1464,13 @@ template <typename T, size_t N> constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T()); template <bool A = false, typename T, size_t N> -KFR_INTRIN void cbitreverse_write(complex<T>* dest, vec<T, N> x) +KFR_INTRIN void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) { cwrite<N / 2, A>(dest, bitreverse<2>(x)); } template <bool A = false, typename T, size_t N> -KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, vec<T, N> x) +KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) { cwrite<N / 2, A>(dest, digitreverse4<2>(x)); } @@ -1471,7 +1498,7 @@ KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15)); } template <> -KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, vec<f64, 32> x) +KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) { cwrite<1>(dest, part<16, 0>(x)); cwrite<1>(dest + 4, part<16, 1>(x)); @@ -1496,3 +1523,5 @@ KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, vec<f64 #endif } } + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp @@ -108,7 +108,7 @@ struct biquad_block vec<T, filters> b2; constexpr biquad_block() noexcept : a1(0), a2(0), b0(1), b1(0), b2(0) {} - constexpr biquad_block(const biquad_params<T>* bq, size_t count) noexcept + CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) noexcept { count = count > filters ? filters : count; for (size_t i = 0; i < count; i++) @@ -161,7 +161,7 @@ struct expression_biquads : public expression<E1> return out; } KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state, - vec<T, filters> in) + const vec<T, filters>& in) { const vec<T, filters> out = bq.b0 * in + state.s1; state.s1 = state.s2 + bq.b1 * in - bq.a1 * out; @@ -301,7 +301,7 @@ CMT_INLINE internal::expression_biquads_zl<filters, T, internal::arg<E1>> biquad template <typename T, typename E1> CMT_INLINE expression_pointer<T> biquad_zl(const biquad_params<T>* bq, size_t count, E1&& e1) { - return cswitch(csizes<1, 2, 4, 8, 16, 32, 64>, next_poweroftwo(count), + return cswitch(csizes_t<1, 2, 4, 8, 16, 32, 64>(), next_poweroftwo(count), [&](auto x) { constexpr size_t filters = x; return to_pointer(internal::expression_biquads_zl<filters, T, internal::arg<E1>>( diff --git a/include/kfr/dsp/delay.hpp b/include/kfr/dsp/delay.hpp @@ -103,7 +103,7 @@ struct expression_delay<1, E> : expression<E> * @endcode */ template <size_t samples = 1, typename E1> -CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize<samples>) +CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>()) { static_assert(samples >= 1 && samples < 1024, ""); return internal::expression_delay<samples, E1>(std::forward<E1>(e1)); diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp @@ -61,7 +61,7 @@ struct expression_short_fir : expression<E1> vec<T, N> in = this->argument_first(cinput, index, x); vec<T, N> out = in * taps[0]; - cfor(csize<1>, csize<tapcount>, + cfor(csize_t<1>(), csize_t<tapcount>(), [&](auto I) { out = out + concat_and_slice<tapcount - 1 - I, N>(delayline, in) * taps[I]; }); delayline = concat_and_slice<N, tapcount - 1>(delayline, in); diff --git a/include/kfr/dsp/mixdown.hpp b/include/kfr/dsp/mixdown.hpp @@ -45,7 +45,7 @@ struct stereo_matrix template <typename T, size_t N> CMT_INLINE vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const { - return process(x, csizeseq<N>); + return process(x, csizeseq_t<N>()); } template <typename T, size_t N, size_t... indices> CMT_INLINE vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const @@ -56,8 +56,16 @@ struct stereo_matrix }; } -constexpr f64x2x2 matrix_sum_diff() { return { f64x2{ 1, 1 }, f64x2{ 1, -1 } }; } -constexpr f64x2x2 matrix_halfsum_halfdiff() { return { f64x2{ 0.5, 0.5 }, f64x2{ 0.5, -0.5 } }; } +template <int = 0> +CMT_GNU_CONSTEXPR f64x2x2 matrix_sum_diff() +{ + return { f64x2{ 1, 1 }, f64x2{ 1, -1 } }; +} +template <int = 0> +CMT_GNU_CONSTEXPR f64x2x2 matrix_halfsum_halfdiff() +{ + return { f64x2{ 0.5, 0.5 }, f64x2{ 0.5, -0.5 } }; +} /** * @brief Returns template expression that returns the vector of length 2 containing mix of the left and right diff --git a/include/kfr/dsp/oscillators.hpp b/include/kfr/dsp/oscillators.hpp @@ -34,7 +34,8 @@ namespace kfr template <typename T = fbase> auto jaehne(identity<T> magn, size_t size) { - return truncate(magn * sin(c_pi<T, 1, 2> * sqr(linspace(T(0), T(size), size, false)) / size), size); + return truncate(magn * sin(constants<T>::pi_s(1, 2) * sqr(linspace(T(0), T(size), size, false)) / size), + size); } template <typename T = fbase> @@ -60,78 +61,78 @@ auto phasor(identity<T> frequency, identity<T> sample_rate) namespace intrinsics { template <typename T> -KFR_SINTRIN T rawsine(T x) +KFR_SINTRIN T rawsine(const T& x) { - return intrinsics::fastsin(x * c_pi<T, 2>); + return intrinsics::fastsin(x * constants<T>::pi_s(2)); } template <typename T> -KFR_SINTRIN T sinenorm(T x) +KFR_SINTRIN T sinenorm(const T& x) { return intrinsics::rawsine(fract(x)); } template <typename T> -KFR_SINTRIN T sine(T x) +KFR_SINTRIN T sine(const T& x) { - return intrinsics::sinenorm(c_recip_pi<T, 1, 2> * x); + return intrinsics::sinenorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_SINTRIN T rawsquare(T x) +KFR_SINTRIN T rawsquare(const T& x) { return select(x < T(0.5), T(1), -T(1)); } template <typename T> -KFR_SINTRIN T squarenorm(T x) +KFR_SINTRIN T squarenorm(const T& x) { return intrinsics::rawsquare(fract(x)); } template <typename T> -KFR_SINTRIN T square(T x) +KFR_SINTRIN T square(const T& x) { - return intrinsics::squarenorm(c_recip_pi<T, 1, 2> * x); + return intrinsics::squarenorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_SINTRIN T rawsawtooth(T x) +KFR_SINTRIN T rawsawtooth(const T& x) { return T(1) - 2 * x; } template <typename T> -KFR_SINTRIN T sawtoothnorm(T x) +KFR_SINTRIN T sawtoothnorm(const T& x) { return intrinsics::rawsawtooth(fract(x)); } template <typename T> -KFR_SINTRIN T sawtooth(T x) +KFR_SINTRIN T sawtooth(const T& x) { - return intrinsics::sawtoothnorm(c_recip_pi<T, 1, 2> * x); + return intrinsics::sawtoothnorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_SINTRIN T isawtoothnorm(T x) +KFR_SINTRIN T isawtoothnorm(const T& x) { return T(-1) + 2 * fract(x + 0.5); } template <typename T> -KFR_SINTRIN T isawtooth(T x) +KFR_SINTRIN T isawtooth(const T& x) { - return intrinsics::isawtoothnorm(c_recip_pi<T, 1, 2> * x); + return intrinsics::isawtoothnorm(constants<T>::recip_pi_s(1, 2) * x); } template <typename T> -KFR_SINTRIN T rawtriangle(T x) +KFR_SINTRIN T rawtriangle(const T& x) { return 1 - abs(4 * x - 2); } template <typename T> -KFR_SINTRIN T trianglenorm(T x) +KFR_SINTRIN T trianglenorm(const T& x) { return intrinsics::rawtriangle(fract(x + 0.25)); } template <typename T> -KFR_SINTRIN T triangle(T x) +KFR_SINTRIN T triangle(const T& x) { - return intrinsics::trianglenorm(c_recip_pi<T, 1, 2> * x); + return intrinsics::trianglenorm(constants<T>::recip_pi_s(1, 2) * x); } } KFR_I_FN(rawsine) diff --git a/include/kfr/dsp/sample_rate_conversion.hpp b/include/kfr/dsp/sample_rate_conversion.hpp @@ -297,13 +297,13 @@ struct expression_downsample<4, offset, E> : expression<E> } template <typename E1, size_t offset = 0> -CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize<0>) +CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize_t<0>()) { return internal::expression_downsample<2, offset, E1>(std::forward<E1>(e1)); } template <typename E1, size_t offset = 0> -CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize<0>) +CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize_t<0>()) { return internal::expression_downsample<4, offset, E1>(std::forward<E1>(e1)); } diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp @@ -37,28 +37,28 @@ using sample_rate_t = double; namespace intrinsics { template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF amp_to_dB(T amp) +KFR_SINTRIN TF amp_to_dB(const T& amp) { return log(static_cast<TF>(amp)) * subtype<TF>(8.6858896380650365530225783783322); // return T( 20.0 ) * log10( level ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF dB_to_amp(T dB) +KFR_SINTRIN TF dB_to_amp(const T& dB) { return exp(dB * subtype<TF>(0.11512925464970228420089957273422)); // return exp10( dB / 20 ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF amp_to_dB(T amp, T offset) +KFR_SINTRIN TF amp_to_dB(const T& amp, const T& offset) { return log_fmadd(amp, subtype<TF>(8.6858896380650365530225783783322), offset); // return T( 20.0 ) * log10( level ); } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF dB_to_amp(T dB, T offset) +KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset) { auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset; return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs); @@ -66,13 +66,13 @@ KFR_SINTRIN TF dB_to_amp(T dB, T offset) } template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout power_to_dB(T x) +KFR_SINTRIN Tout power_to_dB(const T& x) { return log(x) * (10 * c_recip_log_10<Tout>); } template <typename T, typename Tout = flt_type<T>> -KFR_SINTRIN Tout dB_to_power(T x) +KFR_SINTRIN Tout dB_to_power(const T& x) { if (x == -c_infinity<Tout>) return 0.0; @@ -81,7 +81,7 @@ KFR_SINTRIN Tout dB_to_power(T x) } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF note_to_hertz(T note) +KFR_SINTRIN TF note_to_hertz(const T& note) { const subtype<TF> offset = 2.1011784386926213177653145771814; @@ -89,7 +89,7 @@ KFR_SINTRIN TF note_to_hertz(T note) } template <typename T, typename TF = flt_type<T>> -KFR_SINTRIN TF hertz_to_note(T hertz) +KFR_SINTRIN TF hertz_to_note(const T& hertz) { const subtype<TF> offset = -36.376316562295915248836189714583; @@ -97,7 +97,7 @@ KFR_SINTRIN TF hertz_to_note(T hertz) } template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>> -KFR_SINTRIN Tc note_to_hertz(T1 note, T2 tunenote, T3 tunehertz) +KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz) { const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818); @@ -105,7 +105,7 @@ KFR_SINTRIN Tc note_to_hertz(T1 note, T2 tunenote, T3 tunehertz) } template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>> -KFR_SINTRIN Tc hertz_to_note(T1 hertz, T2 tunenote, T3 tunehertz) +KFR_SINTRIN Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz) { const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023); diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp @@ -604,10 +604,10 @@ CMT_NOINLINE expression_pointer<T> window(size_t size, window_type type, identit ctype_t<T> = ctype_t<T>()) { return cswitch( - cvals<window_type, window_type::rectangular, window_type::triangular, window_type::bartlett, - window_type::cosine, window_type::hann, window_type::bartlett_hann, window_type::hamming, - window_type::bohman, window_type::blackman, window_type::blackman_harris, window_type::kaiser, - window_type::flattop, window_type::gaussian, window_type::lanczos>, + cvals_t<window_type, window_type::rectangular, window_type::triangular, window_type::bartlett, + window_type::cosine, window_type::hann, window_type::bartlett_hann, window_type::hamming, + window_type::bohman, window_type::blackman, window_type::blackman_harris, window_type::kaiser, + window_type::flattop, window_type::gaussian, window_type::lanczos>(), type, [=](auto win) { constexpr window_type window = val_of(decltype(win)()); diff --git a/include/kfr/io/audiofile.hpp b/include/kfr/io/audiofile.hpp @@ -44,7 +44,8 @@ void write_interleaved(E1&& dest, const univector2d<Tin, Tag1, Tag2>& src) } else if (channels == 2) { - process(std::forward<E1>(dest), pack(src[0], src[1]), 0, infinite_size, nullptr, nullptr, csize<2>); + process(std::forward<E1>(dest), pack(src[0], src[1]), 0, infinite_size, nullptr, nullptr, + csize_t<2>()); } else { @@ -89,7 +90,8 @@ constexpr range<fmax> audio_range<f64>() inline size_t get_audiobitdepth(audiodatatype type) { - return (size_t[]){ 0, 16, 24, 24, 32, 32, 64 }[static_cast<int>(type)]; + constexpr size_t bits[]{ 0, 16, 24, 24, 32, 32, 64 }; + return static_cast<size_t>(type) < 7 ? bits[static_cast<size_t>(type)] : 0; } template <typename T> @@ -130,6 +132,24 @@ static constexpr u32 FourCC(const char (&ch)[5]) return u32(u8(ch[0])) | u32(u8(ch[1])) << 8 | u32(u8(ch[2])) << 16 | u32(u8(ch[3])) << 24; } +constexpr u32 cWAVE_FORMAT_PCM = 1; +constexpr u32 cWAVE_FORMAT_IEEE = 3; + +constexpr u32 ccRIFF = FourCC("RIFF"); +constexpr u32 ccWAVE = FourCC("WAVE"); +constexpr u32 ccfmt = FourCC("fmt "); +constexpr u32 ccdata = FourCC("data"); + +constexpr u32 ccFORM = FourCC("FORM"); +constexpr u32 ccAIFF = FourCC("AIFF"); +constexpr u32 ccAIFC = FourCC("AIFC"); +constexpr u32 ccCOMM = FourCC("COMM"); +constexpr u32 ccSSND = FourCC("SSND"); +constexpr u32 ccNONE = FourCC("NONE"); +constexpr u32 ccsowt = FourCC("sowt"); + +CMT_PRAGMA_PACK_PUSH_1 + struct WAV_FMT { i32 fId; // 'fmt ' @@ -140,20 +160,20 @@ struct WAV_FMT i32 nAvgBytesPerSec; i16 numBlockAlingn; i16 numBitsPerSample; -} __attribute__((packed)); +} CMT_GNU_PACKED; struct WAV_DATA { i32 dId; // 'data' or 'fact' i32 dLen; u8 data[1]; -} __attribute__((packed)); +} CMT_GNU_PACKED; struct WAV_DATA_HDR { i32 dId; // 'data' or 'fact' i32 dLen; -} __attribute__((packed)); +} CMT_GNU_PACKED; struct AIFF_FMT { @@ -164,53 +184,39 @@ struct AIFF_FMT i16 bitsPerSample; f80 sampleRate; i32 compression; -} __attribute__((packed)); +} CMT_GNU_PACKED; struct AIFF_DATA { i32 chunkID; i32 chunkLen; u32 offset; -} __attribute__((packed)); - -constexpr u32 cWAVE_FORMAT_PCM = 1; -constexpr u32 cWAVE_FORMAT_IEEE = 3; - -constexpr u32 ccRIFF = FourCC("RIFF"); -constexpr u32 ccWAVE = FourCC("WAVE"); -constexpr u32 ccfmt = FourCC("fmt "); -constexpr u32 ccdata = FourCC("data"); - -constexpr u32 ccFORM = FourCC("FORM"); -constexpr u32 ccAIFF = FourCC("AIFF"); -constexpr u32 ccAIFC = FourCC("AIFC"); -constexpr u32 ccCOMM = FourCC("COMM"); -constexpr u32 ccSSND = FourCC("SSND"); -constexpr u32 ccNONE = FourCC("NONE"); -constexpr u32 ccsowt = FourCC("sowt"); +} CMT_GNU_PACKED; struct RIFF_HDR { i32 riffID; // 'RIFF' or 'COMM' i32 fileLen; i32 formatID; // 'WAVE' or 'AIFF' -} __attribute__((packed)); +} CMT_GNU_PACKED; struct WAV_HEADER { RIFF_HDR riff; WAV_FMT fmt; WAV_DATA_HDR data; - -} __attribute__((packed)); +} CMT_GNU_PACKED; struct CHUNK_HDR { i32 chunkID; i32 chunkLen; -} __attribute__((packed)); +} CMT_GNU_PACKED; + +CMT_PRAGMA_PACK_POP -static bool audio_test_wav(const array_ref<u8>& rawbytes) +template <size_t = 0> +bool audio_test_wav(const array_ref<u8>& rawbytes) { if (rawbytes.size() < sizeof(RIFF_HDR)) { @@ -228,7 +234,8 @@ static bool audio_test_wav(const array_ref<u8>& rawbytes) return true; } -static bool audio_test_aiff(const array_ref<u8>& rawbytes) +template <size_t = 0> +bool audio_test_aiff(const array_ref<u8>& rawbytes) { if (rawbytes.size() < sizeof(RIFF_HDR)) { @@ -255,7 +262,8 @@ enum class file_status unsupported_bit_format }; -static file_status audio_info_wav(audioformat& info, const array_ref<u8>& rawbytes) +template <size_t = 0> +file_status audio_info_wav(audioformat& info, const array_ref<u8>& rawbytes) { const CHUNK_HDR* chunk = ptr_cast<CHUNK_HDR>(rawbytes.data() + 12); const void* end = ptr_cast<char>(rawbytes.end()); @@ -324,7 +332,8 @@ static file_status audio_info_wav(audioformat& info, const array_ref<u8>& rawbyt return file_status::ok; } -static file_status audio_info(audioformat& info, const array_ref<u8>& file_bytes) +template <size_t = 0> +file_status audio_info(audioformat& info, const array_ref<u8>& file_bytes) { if (audio_test_wav(file_bytes)) return audio_info_wav(info, file_bytes); diff --git a/include/kfr/io/python_plot.hpp b/include/kfr/io/python_plot.hpp @@ -40,9 +40,9 @@ namespace kfr { namespace internal { -#pragma clang diagnostic push +CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wdeprecated-declarations") -#pragma clang diagnostic ignored "-Wdeprecated-declarations" +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wdeprecated-declarations") #endif template <int = 0> @@ -66,7 +66,7 @@ void python(const std::string& name, const std::string& code) fclose(f); std::system(("python \"" + filename + "\"").c_str()); } -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) } inline std::string concat_args() { return {}; } diff --git a/include/kfr/io/tostring.hpp b/include/kfr/io/tostring.hpp @@ -111,6 +111,7 @@ inline std::string array_to_string(const kfr::complex<T>* source, size_t N) template <typename T> struct representation<kfr::complex<T>> { + using type = std::string; static std::string get(const kfr::complex<T>& value) { return as_string(value.real()) + " + " + as_string(value.imag()) + "j"; @@ -120,12 +121,14 @@ struct representation<kfr::complex<T>> template <> struct representation<kfr::cpu_t> { + using type = std::string; static std::string get(kfr::cpu_t value) { return kfr::cpu_name(value); } }; template <typename T, size_t N> struct representation<kfr::vec<T, N>> { + using type = std::string; static std::string get(const kfr::vec<T, N>& value) { return details::array_to_string(value.data(), value.size()); @@ -135,6 +138,7 @@ struct representation<kfr::vec<T, N>> template <typename T, size_t Tag> struct representation<kfr::univector<T, Tag>> { + using type = std::string; static std::string get(const kfr::univector<T, Tag>& value) { return details::array_to_string(value.data(), value.size()); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -67,28 +67,33 @@ else () set(EMULATOR "") endif () +#get_cmake_property(_variableNames VARIABLES) +#foreach (_variableName ${_variableNames}) +# message(STATUS "${_variableName}=${${_variableName}}") +#endforeach() + if (NOT IOS) enable_testing() add_test(NAME base_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/base_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/base_test) add_test(NAME intrinsic_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/intrinsic_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/intrinsic_test) add_test(NAME dsp_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/dsp_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dsp_test) if (MPFR_FOUND) add_test(NAME transcendental_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/transcendental_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/transcendental_test) endif () add_test(NAME complex_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/complex_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/complex_test) add_test(NAME expression_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/expression_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/expression_test) if (NOT ARM) add_test(NAME multiarch - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/multiarch) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/multiarch) endif () add_test(NAME dft_test - COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/tests/dft_test) + COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dft_test) endif () diff --git a/tests/base_test.cpp b/tests/base_test.cpp @@ -76,16 +76,17 @@ TEST(test_basic) CHECK(even(numbers2) == vec<int, 4>{ 100, 102, 104, 106 }); // * The following command pairs are equivalent: - CHECK(permute(numbers1, elements<0, 2, 1, 3, 4, 6, 5, 7>) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); - CHECK(permute(numbers1, elements<0, 2, 1, 3>) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); + CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); + CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 }); - CHECK(shuffle(numbers1, numbers2, elements<0, 8, 2, 10, 4, 12, 6, 14>) == + CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); - CHECK(shuffle(numbers1, numbers2, elements<0, 8>) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); + CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 }); - CHECK(blend(numbers1, numbers2, elements<0, 1, 1, 0, 1, 1, 0, 1>) == + CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) == + vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); + CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) == vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); - CHECK(blend(numbers1, numbers2, elements<0, 1, 1>) == vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 }); // * Transpose matrix: const auto sixteen = enumerate<float, 16>(); @@ -154,6 +155,9 @@ TEST(vec_zerovector) TEST(vec_allonesvector) { + CHECK(bitcast<u32>(constants<f32>::allones()) == 0xFFFFFFFFu); + CHECK(bitcast<u64>(constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull); + CHECK(~allonesvector<f32, 3>() == f32x3{ 0, 0, 0 }); CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 }); CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 }); @@ -221,6 +225,10 @@ TEST(vec_is_convertible) static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, ""); static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, ""); + testo::assert_is_same<i32x4, common_type<i32x4>>(); + testo::assert_is_same<u32x4, common_type<i32x4, u32x4>>(); + testo::assert_is_same<f64x4, common_type<i32x4, u32x4, f64x4>>(); + CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f }); CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 }); CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 }); @@ -282,7 +290,7 @@ TEST(test_stat) } } -int main(int argc, char** argv) +int main() { println(library_version()); return testo::run_all("", true); diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp @@ -9,7 +9,6 @@ #include <kfr/io.hpp> using namespace kfr; -using testo::assert_is_same; TEST(complex_vector) { @@ -67,7 +66,7 @@ TEST(complex_math) CHECK(a - b == make_vector(c32{ -2, -2 })); CHECK(a * b == make_vector(c32{ -5, 10 })); CHECK(a * 2 == make_vector(c32{ 2, 4 })); - CHECK(a / b == make_vector(c32{ 0.44, 0.08 })); + CHECK(a / b == make_vector(c32{ 0.44f, 0.08f })); CHECK(-a == make_vector(c32{ -1, -2 })); CHECK(real(a) == make_vector(1.f)); @@ -86,26 +85,26 @@ TEST(complex_math) testo::epsilon<f32>() *= 5; testo::epsilon<f64>() *= 5; - CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773, 0.634963914784736 }); - CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489, -0.9888977057628651 }); - CHECK(csinh(c32{ 1.f, 1.f }) == c32{ 0.634963914784736, 1.2984575814159773 }); - CHECK(ccosh(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489, 0.9888977057628651 }); + CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773f, 0.634963914784736f }); + CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, -0.9888977057628651f }); + CHECK(csinh(c32{ 1.f, 1.f }) == c32{ 0.634963914784736f, 1.2984575814159773f }); + CHECK(ccosh(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, 0.9888977057628651f }); - CHECK(clog(c32{ 1.f, 1.f }) == c32{ 0.34657359027997264, 0.7853981633974483 }); - CHECK(clog2(c32{ 1.f, 1.f }) == c32{ 0.5, 1.1330900354567983 }); - CHECK(clog10(c32{ 1.f, 1.f }) == c32{ 0.15051499783199057, 0.3410940884604603 }); + CHECK(clog(c32{ 1.f, 1.f }) == c32{ 0.34657359027997264f, 0.7853981633974483f }); + CHECK(clog2(c32{ 1.f, 1.f }) == c32{ 0.5f, 1.1330900354567983f }); + CHECK(clog10(c32{ 1.f, 1.f }) == c32{ 0.15051499783199057f, 0.3410940884604603f }); - CHECK(cexp(c32{ 1.f, 1.f }) == c32{ 1.4686939399158849, 2.2873552871788423 }); - CHECK(cexp2(c32{ 1.f, 1.f }) == c32{ 1.5384778027279442, 1.2779225526272695 }); - CHECK(cexp10(c32{ 1.f, 1.f }) == c32{ -6.682015101903131, 7.439803369574931 }); + CHECK(cexp(c32{ 1.f, 1.f }) == c32{ 1.4686939399158849f, 2.2873552871788423f }); + CHECK(cexp2(c32{ 1.f, 1.f }) == c32{ 1.5384778027279442f, 1.2779225526272695f }); + CHECK(cexp10(c32{ 1.f, 1.f }) == c32{ -6.682015101903131f, 7.439803369574931f }); #ifdef KFR_NATIVE_F64 - CHECK(csin(c64{ 1.f, 1.f }) == c64{ 1.2984575814159773, 0.634963914784736 }); - CHECK(ccos(c64{ 1.f, 1.f }) == c64{ 0.8337300251311489, -0.9888977057628651 }); - CHECK(csinh(c64{ 1.f, 1.f }) == c64{ 0.634963914784736, 1.2984575814159773 }); - CHECK(ccosh(c64{ 1.f, 1.f }) == c64{ 0.8337300251311489, 0.9888977057628651 }); - CHECK(clog(c64{ 1.f, 1.f }) == c64{ 0.34657359027997264, 0.7853981633974483 }); - CHECK(cexp(c64{ 1.f, 1.f }) == c64{ 1.4686939399158849, 2.2873552871788423 }); + CHECK(csin(c64{ 1.0, 1.0 }) == c64{ 1.2984575814159773, 0.634963914784736 }); + CHECK(ccos(c64{ 1.0, 1.0 }) == c64{ 0.8337300251311489, -0.9888977057628651 }); + CHECK(csinh(c64{ 1.0, 1.0 }) == c64{ 0.634963914784736, 1.2984575814159773 }); + CHECK(ccosh(c64{ 1.0, 1.0 }) == c64{ 0.8337300251311489, 0.9888977057628651 }); + CHECK(clog(c64{ 1.0, 1.0 }) == c64{ 0.34657359027997264, 0.7853981633974483 }); + CHECK(cexp(c64{ 1.0, 1.0 }) == c64{ 1.4686939399158849, 2.2873552871788423 }); #endif } @@ -161,19 +160,18 @@ TEST(complex_function_expressions) CHECK(uv3[1] == 2.f); CHECK(uv3[2] == 8.f); CHECK(uv3[3] == 18.f); - - assert_is_same<c32, value_type_of<decltype(uv2)>>(); - assert_is_same<f32, value_type_of<decltype(uv3)>>(); - assert_is_same<f32, value_type_of<decltype(real(uv2))>>(); + testo::assert_is_same<c32, value_type_of<decltype(uv2)>>(); + testo::assert_is_same<f32, value_type_of<decltype(uv3)>>(); + testo::assert_is_same<f32, value_type_of<decltype(real(uv2))>>(); } TEST(static_tests) { #ifdef CMT_ARCH_SSE2 - static_assert(vector_width<f32, cpu_t::sse2> == 4, ""); - static_assert(vector_width<c32, cpu_t::sse2> == 2, ""); - static_assert(vector_width<i32, cpu_t::sse2> == 4, ""); - static_assert(vector_width<complex<i32>, cpu_t::sse2> == 2, ""); + static_assert(platform<f32, cpu_t::sse2>::vector_width == 4, ""); + static_assert(platform<c32, cpu_t::sse2>::vector_width == 2, ""); + static_assert(platform<i32, cpu_t::sse2>::vector_width == 4, ""); + static_assert(platform<complex<i32>, cpu_t::sse2>::vector_width == 2, ""); #endif static_assert(is_numeric<vec<complex<float>, 4>>::value, ""); @@ -184,26 +182,26 @@ TEST(static_tests) static_assert(vec<c32, 4>::size() == 4, ""); static_assert(vec<f32, 4>::scalar_size() == 4, ""); static_assert(vec<c32, 4>::scalar_size() == 8, ""); - assert_is_same<subtype<complex<i32>>, i32>(); - assert_is_same<vec<c32, 4>::value_type, c32>(); - assert_is_same<vec<c32, 4>::scalar_type, f32>(); - assert_is_same<vec<f32, 4>::value_type, f32>(); - assert_is_same<vec<f32, 4>::scalar_type, f32>(); - assert_is_same<vec<c32, 1>, decltype(make_vector(c32{ 0, 0 }))>(); - assert_is_same<vec<c32, 2>, decltype(make_vector(c32{ 0, 0 }, 4))>(); - assert_is_same<ftype<complex<i32>>, complex<f32>>(); - assert_is_same<ftype<complex<i64>>, complex<f64>>(); - assert_is_same<ftype<vec<complex<i32>, 4>>, vec<complex<f32>, 4>>(); - assert_is_same<ftype<vec<complex<i64>, 8>>, vec<complex<f64>, 8>>(); - - assert_is_same<kfr::internal::arg<int>, kfr::internal::expression_scalar<int, 1>>(); - assert_is_same<kfr::internal::arg<complex<int>>, - kfr::internal::expression_scalar<kfr::complex<int>, 1>>(); - - assert_is_same<common_type<complex<int>, double>, complex<double>>(); + testo::assert_is_same<subtype<complex<i32>>, i32>(); + testo::assert_is_same<vec<c32, 4>::value_type, c32>(); + testo::assert_is_same<vec<c32, 4>::scalar_type, f32>(); + testo::assert_is_same<vec<f32, 4>::value_type, f32>(); + testo::assert_is_same<vec<f32, 4>::scalar_type, f32>(); + testo::assert_is_same<vec<c32, 1>, decltype(make_vector(c32{ 0, 0 }))>(); + testo::assert_is_same<vec<c32, 2>, decltype(make_vector(c32{ 0, 0 }, 4))>(); + testo::assert_is_same<ftype<complex<i32>>, complex<f32>>(); + testo::assert_is_same<ftype<complex<i64>>, complex<f64>>(); + testo::assert_is_same<ftype<vec<complex<i32>, 4>>, vec<complex<f32>, 4>>(); + testo::assert_is_same<ftype<vec<complex<i64>, 8>>, vec<complex<f64>, 8>>(); + + testo::assert_is_same<kfr::internal::arg<int>, kfr::internal::expression_scalar<int, 1>>(); + testo::assert_is_same<kfr::internal::arg<complex<int>>, + kfr::internal::expression_scalar<kfr::complex<int>, 1>>(); + + testo::assert_is_same<common_type<complex<int>, double>, complex<double>>(); } -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -92,7 +92,7 @@ TEST(fft_accuracy) }); } -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp @@ -6,7 +6,6 @@ #include "testo/testo.hpp" #include <kfr/base.hpp> -#include <kfr/dft.hpp> #include <kfr/dsp.hpp> #include <kfr/io.hpp> @@ -21,7 +20,7 @@ TEST(delay) CHECK(v2[2] == 101); CHECK(v2[19] == 118); - const univector<float, 33> v3 = delay(v1, csize<3>); + const univector<float, 33> v3 = delay(v1, csize_t<3>()); CHECK(v3[0] == 0); CHECK(v3[1] == 0); CHECK(v3[2] == 0); @@ -34,16 +33,16 @@ TEST(fracdelay) { univector<double, 5> a({ 1, 2, 3, 4, 5 }); univector<double, 5> b = fracdelay(a, 0.5); - CHECK(rms(b - univector<double>({ 0.5, 1.5, 2.5, 3.5, 4.5 })) < c_epsilon<double> * 5); + CHECK(rms(b - univector<double>({ 0.5, 1.5, 2.5, 3.5, 4.5 })) < constants<double>::epsilon * 5); b = fracdelay(a, 0.1); - CHECK(rms(b - univector<double>({ 0.9, 1.9, 2.9, 3.9, 4.9 })) < c_epsilon<double> * 5); + CHECK(rms(b - univector<double>({ 0.9, 1.9, 2.9, 3.9, 4.9 })) < constants<double>::epsilon * 5); b = fracdelay(a, 0.0); - CHECK(rms(b - univector<double>({ 1, 2, 3, 4, 5 })) < c_epsilon<double> * 5); + CHECK(rms(b - univector<double>({ 1, 2, 3, 4, 5 })) < constants<double>::epsilon * 5); b = fracdelay(a, 1.0); - CHECK(rms(b - univector<double>({ 0, 1, 2, 3, 4 })) < c_epsilon<double> * 5); + CHECK(rms(b - univector<double>({ 0, 1, 2, 3, 4 })) < constants<double>::epsilon * 5); } TEST(mixdown) @@ -80,7 +79,7 @@ TEST(phasor) CHECK(rms(v1 - v2) < 1.e-5); } -int main(int argc, char** argv) +int main() { println(library_version()); return testo::run_all("", true); diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp @@ -2,4 +2,4 @@ using namespace kfr; -int main(int argc, char** argv) { return 0; } +int main() {} diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp @@ -166,7 +166,7 @@ TEST(partition) } } -int main(int argc, char** argv) +int main() { println(library_version()); diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp @@ -70,7 +70,7 @@ inline T ref_abs(T x) template <typename T> bool builtin_add_overflow(T x, T y, T* r) { -#if __has_builtin(__builtin_add_overflow) +#if CMT_HAS_BUILTIN(__builtin_add_overflow) || defined CMT_COMPILER_GCC return __builtin_add_overflow(x, y, r); #else *r = x + y; @@ -80,17 +80,27 @@ bool builtin_add_overflow(T x, T y, T* r) template <> bool builtin_add_overflow<u64>(u64 x, u64 y, u64* r) { +#if CMT_HAS_BUILTIN(__builtin_uaddll_overflow) || defined CMT_COMPILER_GCC return __builtin_uaddll_overflow(x, y, reinterpret_cast<unsigned long long*>(r)); +#else + *r = x + y; + return x > 0xFFFFFFFFFFFFFFFFull - y; +#endif } template <> bool builtin_add_overflow<i64>(i64 x, i64 y, i64* r) { +#if CMT_HAS_BUILTIN(__builtin_saddll_overflow) || defined CMT_COMPILER_GCC return __builtin_saddll_overflow(x, y, reinterpret_cast<long long*>(r)); +#else + *r = x + y; + return !((x ^ y) & 0x8000000000000000ull) && ((*r ^ x) & 0x8000000000000000ull); +#endif } template <typename T> bool builtin_sub_overflow(T x, T y, T* r) { -#if __has_builtin(__builtin_sub_overflow) +#if CMT_HAS_BUILTIN(__builtin_sub_overflow) || defined CMT_COMPILER_GCC return __builtin_sub_overflow(x, y, r); #else *r = x - y; @@ -100,12 +110,22 @@ bool builtin_sub_overflow(T x, T y, T* r) template <> bool builtin_sub_overflow<u64>(u64 x, u64 y, u64* r) { +#if CMT_HAS_BUILTIN(__builtin_usubll_overflow) || defined CMT_COMPILER_GCC return __builtin_usubll_overflow(x, y, reinterpret_cast<unsigned long long*>(r)); +#else + *r = x - y; + return x < y; +#endif } template <> bool builtin_sub_overflow<i64>(i64 x, i64 y, i64* r) { +#if CMT_HAS_BUILTIN(__builtin_ssubll_overflow) || defined CMT_COMPILER_GCC return __builtin_ssubll_overflow(x, y, reinterpret_cast<long long*>(r)); +#else + *r = x - y; + return ((x ^ y) & 0x8000000000000000ull) && ((*r ^ x) & 0x8000000000000000ull); +#endif } //#endif template <typename T> @@ -152,7 +172,7 @@ TEST(intrin_abs) using T = type_of<decltype(type)>; using Tsub = subtype<T>; const T x(value); - CHECK(kfr::abs(x) == apply([](auto x) { return Tsub(std::abs(x)); }, x)); + CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x)); }); } @@ -389,4 +409,4 @@ TEST(intrin_math) CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233))); } -int main(int argc, char** argv) { return testo::run_all("", false); } +int main() { return testo::run_all("", false); } diff --git a/tests/multiarch.cpp b/tests/multiarch.cpp @@ -54,4 +54,4 @@ TEST(test_fir_avx) } } -int main(int argc, char** argv) { return testo::run_all("", true); } +int main() { return testo::run_all("", true); } diff --git a/tests/testo/testo.hpp b/tests/testo/testo.hpp @@ -99,7 +99,7 @@ struct comparison comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)) {} - bool operator()() { return cmp(left, right); } + bool operator()() const { return cmp(left, right); } }; template <typename Left, typename Right> @@ -120,8 +120,8 @@ struct equality_comparer bool operator()(const L& l, const R& r) const { return l == r; } }; -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wfloat-equal" +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal") template <typename T> inline T& epsilon() @@ -149,7 +149,7 @@ struct equality_comparer<long double, long double> } }; -#pragma clang diagnostic pop +CMT_PRAGMA_GNU(GCC diagnostic pop) template <typename L, typename R> struct equality_comparer<L, R, void_t<enable_if<!compound_type_traits<L>::is_scalar>>> @@ -179,7 +179,7 @@ struct cmp_eq static const char* op() { return "=="; } template <typename L, typename R> - bool operator()(L&& left, R&& right) + bool operator()(L&& left, R&& right) const { equality_comparer<decay<L>, decay<R>> eq; return eq(left, right); @@ -191,7 +191,7 @@ struct cmp_ne static const char* op() { return "!="; } template <typename L, typename R> - bool operator()(L&& left, R&& right) + bool operator()(L&& left, R&& right) const { return !cmp_eq()(left, right); } @@ -202,7 +202,7 @@ struct cmp_lt static const char* op() { return "<"; } template <typename L, typename R> - bool operator()(L&& left, R&& right) + bool operator()(L&& left, R&& right) const { return left < right; } @@ -213,7 +213,7 @@ struct cmp_gt static const char* op() { return ">"; } template <typename L, typename R> - bool operator()(L&& left, R&& right) + bool operator()(L&& left, R&& right) const { return left > right; } @@ -224,7 +224,7 @@ struct cmp_le static const char* op() { return "<="; } template <typename L, typename R> - bool operator()(L&& left, R&& right) + bool operator()(L&& left, R&& right) const { return left <= right; } @@ -235,7 +235,7 @@ struct cmp_ge static const char* op() { return ">="; } template <typename L, typename R> - bool operator()(L&& left, R&& right) + bool operator()(L&& left, R&& right) const { return left >= right; } @@ -394,14 +394,14 @@ struct test_case } template <typename Op, typename L, typename R> - void check(comparison<Op, L, R> comparison, const char* expr) + void check(const comparison<Op, L, R>& comparison, const char* expr) { bool result = comparison(); check(result, as_string(comparison.left, " ", Op::op(), " ", comparison.right), expr); } template <typename L> - void check(half_comparison<L> comparison, const char* expr) + void check(const half_comparison<L>& comparison, const char* expr) { bool result = comparison.left ? true : false; check(result, as_string(comparison.left), expr); diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp @@ -26,7 +26,8 @@ double ulps(T test, const mpfr::number& ref) TEST(test_sin_cos) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, +c_pi<f64, 2>, 0.05), + testo::matrix(named("type") = ctypes_t<f32, f64>(), + named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -37,7 +38,7 @@ TEST(test_sin_cos) TEST(test_log) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, 100.0, 0.5), + testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -47,7 +48,7 @@ TEST(test_log) TEST(test_log2) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, 100.0, 0.5), + testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -57,7 +58,7 @@ TEST(test_log2) TEST(test_log10) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(0.0, 100.0, 0.5), + testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -67,7 +68,7 @@ TEST(test_log10) TEST(test_exp) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(-10, +10, 0.05), + testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -77,7 +78,7 @@ TEST(test_exp) TEST(test_exp2) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(-10, +10, 0.05), + testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -87,7 +88,7 @@ TEST(test_exp2) TEST(test_exp10) { - testo::matrix(named("type") = ctypes<f32, f64>, named("value") = make_range(-10, +10, 0.05), + testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); @@ -95,7 +96,7 @@ TEST(test_exp10) }); } -int main(int argc, char** argv) +int main() { mpfr::scoped_precision p(128); return testo::run_all("");