kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 51f526fdde45132c83c9c6368a770aed291be3f2
parent 509d0251987260ef1f7845592147120e1158b75b
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Mon, 19 Nov 2018 05:41:10 +0300

KFR 3.0 fixes/tests

Diffstat:
MCMakeLists.txt | 10++--------
Mexamples/CMakeLists.txt | 2+-
Minclude/kfr/base/atan.hpp | 8++++----
Minclude/kfr/base/basic_expressions.hpp | 1+
Minclude/kfr/base/logical.hpp | 3++-
Minclude/kfr/base/random.hpp | 11+++++++----
Minclude/kfr/base/round.hpp | 8++++----
Minclude/kfr/base/select.hpp | 2+-
Minclude/kfr/base/simd_clang.hpp | 2+-
Minclude/kfr/base/sin_cos.hpp | 33+++++++++++++++++----------------
Minclude/kfr/base/tan.hpp | 16++++++++++------
Minclude/kfr/base/vec.hpp | 73++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Minclude/kfr/cident.h | 4++++
Minclude/kfr/dft/convolution.hpp | 133++++++++++++++-----------------------------------------------------------------
Minclude/kfr/dft/dft-src.cpp | 179+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mtests/CMakeLists.txt | 68++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Atests/all_tests.cpp | 22++++++++++++++++++++++
Mtests/base_test.cpp | 2++
Mtests/complex_test.cpp | 2++
Mtests/dft_test.cpp | 12+++++++-----
Mtests/dsp_test.cpp | 2++
Mtests/expression_test.cpp | 2++
Mtests/intrinsic_test.cpp | 26+++++++++++++++++++++-----
Mtests/transcendental_test.cpp | 68+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
24 files changed, 484 insertions(+), 205 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -15,13 +15,7 @@ # along with KFR. -cmake_minimum_required(VERSION 2.8) - -if (CMAKE_VERSION VERSION_LESS "2.8.12") - function(add_compile_options) - add_definitions(${ARGN}) - endfunction(add_compile_options) -endif () +cmake_minimum_required(VERSION 3.0) #set(DISABLE_CLANG 1) @@ -91,7 +85,7 @@ if (${CMAKE_GENERATOR} STREQUAL "MinGW Makefiles" OR ${CMAKE_GENERATOR} STREQUAL "MSYS Makefiles" OR ${CMAKE_GENERATOR} STREQUAL "Unix Makefiles" OR ${CMAKE_GENERATOR} STREQUAL "Ninja") - + if (CMAKE_CXX_COMPILER MATCHES "clang" AND NOT CMAKE_CXX_COMPILER MATCHES "clang-cl") if (WIN32) # On windows, clang requires --target diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt @@ -15,7 +15,7 @@ # along with KFR. -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.0) include_directories(../include) diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp @@ -114,8 +114,8 @@ KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x) r = mulsign(r, x); r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r); r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r); - r = select(y == 0.0f, (x < 0) & pi, r); - r = (isnan(x) || isnan(y)) | mulsign(r, y); + r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r); + r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); return r; } @@ -129,8 +129,8 @@ KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) r = mulsign(r, x); r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r); r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r); - r = select(y == 0.0, (x < 0) & pi, r); - r = (isnan(x) || isnan(y)) | mulsign(r, y); + r = select(y == 0.0, select(x < 0., pi, 0.), r); + r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y); return r; } diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp @@ -27,6 +27,7 @@ #include "univector.hpp" #include "vec.hpp" +#include "operators.hpp" #include <algorithm> namespace kfr diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -37,7 +37,8 @@ namespace intrinsics template <size_t bits> struct bitmask { - using type = findinttype<0, (1ull << bits) - 1>; + using type = conditional<(bits > 32), uint64_t, + conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>; bitmask(type val) : value(val) {} diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp @@ -82,11 +82,14 @@ inline vec<u8, N> random_bits(random_bit_generator& gen) { return narrow<N>(bitcast<u8>(gen())); } + template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))> inline vec<u8, N> random_bits(random_bit_generator& gen) { - constexpr size_t N2 = prev_poweroftwo(N - 1); - return concat(random_bits<N2>(gen), random_bits<N - N2>(gen)); + constexpr size_t N2 = prev_poweroftwo(N - 1); + const vec<u8, N2> bits1 = random_bits<N2>(gen); + const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen); + return concat(bits1, bits2); } template <typename T, size_t N, KFR_ENABLE_IF(std::is_integral<T>::value)> @@ -162,7 +165,7 @@ struct expression_random_range : input_expression const T min; const T max; }; -} +} // namespace internal template <typename T> inline internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen) @@ -187,4 +190,4 @@ inline internal::expression_random_range<T> gen_random_range(T min, T max) { return internal::expression_random_range<T>(random_bit_generator(seed_from_rdtsc), min, max); } -} +} // namespace kfr diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp @@ -116,25 +116,25 @@ template <size_t N> KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x) { vec<f32, N> t = cast<f32>(cast<i32>(x)); - return t - ((x < t) & 1.f); + return t - select(x < t, 1.f, 0.f); } template <size_t N> KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x) { vec<f64, N> t = cast<f64>(cast<i64>(x)); - return t - ((x < t) & 1.0); + return t - select(x < t, 1., 0.); } template <size_t N> KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x) { vec<f32, N> t = cast<f32>(cast<i32>(x)); - return t + ((x > t) & 1.f); + return t + select(x > t, 1.f, 0.f); } template <size_t N> KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x) { vec<f64, N> t = cast<f64>(cast<i64>(x)); - return t + ((x > t) & 1.0); + return t + select(x > t, 1., 0.); } template <size_t N> KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x) diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -167,7 +167,7 @@ KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, co template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) { - return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c))); + return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) diff --git a/include/kfr/base/simd_clang.hpp b/include/kfr/base/simd_clang.hpp @@ -189,7 +189,7 @@ struct alignas(alignof(internal::simd_type<T, N>)) vec : public vec_t<T, N> constexpr friend mask_t operator<=(const vec& x, const vec& y) noexcept { return KFR_S(*x <= *y); } constexpr friend mask_t operator>=(const vec& x, const vec& y) noexcept { return KFR_S(*x >= *y); } - constexpr mask_t asmask() const noexcept { return mask_t(*this); } + constexpr mask_t asmask() const noexcept { return mask_t(simd); } #undef KFR_S #undef KFR_U diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp @@ -61,14 +61,14 @@ template <typename T, size_t N, typename Tprecise = f64> KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant) { const vec<T, N> xabs = abs(x); - constexpr T div = constants<T>::fold_constant_div; - vec<T, N> y = floor(xabs / div); - quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0)); + constexpr T div = constants<T>::fold_constant_div; + vec<T, N> y = floor(xabs / div); + quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0)); const mask<T, N> msk = (quadrant & 1) != 0; - quadrant = kfr::select(msk, quadrant + 1, quadrant); - y = select(msk, y + T(1.0), y); - quadrant = quadrant & 7; + quadrant = kfr::select(msk, quadrant + 1, quadrant); + y = select(msk, y + T(1.0), y); + quadrant = quadrant & 7; constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi); constexpr T rem1 = constants<T>::fold_constant_rem1; @@ -134,10 +134,11 @@ KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cos vec<itype<T>, N> quadrant; vec<T, N> folded = trig_fold(x_full, quadrant); - mask<T, N> flip_sign = kfr::select(cosmask, (quadrant == 2) || (quadrant == 4), quadrant >= 4); + mask<T, N> flip_sign = + kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask(); mask<T, N> usecos = (quadrant == 2) || (quadrant == 6); - usecos = usecos ^ cosmask; + usecos = usecos ^ cosmask; vec<T, N> formula = trig_sincos(folded, usecos); @@ -160,7 +161,7 @@ KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x) vec<T, N> formula = trig_sincos(folded, usecos); - formula = select(flip_sign ^ x, -formula, formula); + formula = select(flip_sign ^ mask<T, N>(x), -formula, formula); return formula; } @@ -193,15 +194,15 @@ KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x) vec<T, N> xx = x - pi; vec<T, N> y = abs(xx); - y = select(y > c_pi<T, 1, 2>, pi - y, y); - y = y ^ (msk & ~xx); + y = select(y > c_pi<T, 1, 2>, pi - y, y); + y = y ^ (msk & ~xx); vec<T, N> y2 = y * y; vec<T, N> formula = c6; vec<T, N> y3 = y2 * y; - formula = fmadd(formula, y2, c4); - formula = fmadd(formula, y2, c2); - formula = formula * y3 + y; + formula = fmadd(formula, y2, c4); + formula = fmadd(formula, y2, c2); + formula = formula * y3 + y; return formula; } @@ -316,7 +317,7 @@ KFR_SINTRIN Tout cossindeg(const T& x) { return cossin(x * constants<Tout>::degtorad); } -} +} // namespace intrinsics KFR_I_FN(sin) KFR_I_FN(cos) @@ -616,4 +617,4 @@ KFR_SINTRIN T cos3x(const T& sinx, const T& cosx) { return cosx * (1 - 4 * sqr(sinx)); } -} +} // namespace kfr diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp @@ -51,10 +51,10 @@ KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inve vec<T, N> x = y - k_real * pi_14; mask<T, N> need_offset = (k & 1) != 0; - x = select(need_offset, x - pi_14, x); + x = select(need_offset, x - pi_14, x); vec<IT, N> k_mod4 = k & 3; - inverse = (k_mod4 == 1) || (k_mod4 == 2); + inverse = (k_mod4 == 1) || (k_mod4 == 2); return x; } @@ -62,7 +62,9 @@ template <size_t N> KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full) { mask<f32, N> inverse; - const vec<f32, N> x = trig_fold_simple(x_full, inverse); + vec<i32, N> quad; + const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); + inverse = quad == 2 || quad == 6; constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01); constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01); @@ -90,7 +92,9 @@ template <size_t N> KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full) { mask<f64, N> inverse; - const vec<f64, N> x = trig_fold_simple(x_full, inverse); + vec<i64, N> quad; + const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse); + inverse = quad == 2 || quad == 6; constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01); constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01); @@ -129,7 +133,7 @@ KFR_SINTRIN flt_type<T> tandeg(const T& x) { return tan(x * c_degtorad<flt_type<T>>); } -} +} // namespace intrinsics KFR_I_FN(tan) KFR_I_FN(tandeg) @@ -156,4 +160,4 @@ KFR_FUNC internal::expression_function<fn::tandeg, E1> tandeg(E1&& x) { return { fn::tandeg(), std::forward<E1>(x) }; } -} +} // namespace kfr diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp @@ -37,7 +37,7 @@ template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)> CMT_INLINE vec<T, Nout> low(const vec<T, N>& x); template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)> CMT_INLINE vec<T, Nout> high(const vec<T, N>& x); -} +} // namespace kfr #ifdef CMT_COMPILER_CLANG #include "simd_clang.hpp" @@ -65,7 +65,7 @@ template <typename T> using maskfor = typename T::mask_t; template <typename T, size_t N> -struct mask : vec<T, N> +struct mask : protected vec<T, N> { using base = vec<T, N>; KFR_I_CE mask() noexcept = default; @@ -73,17 +73,31 @@ struct mask : vec<T, N> KFR_I_CE mask& operator=(const mask&) noexcept = default; using simd_type = typename base::simd_type; - KFR_I_CE mask(const base& v) noexcept : base(v) {} + simd_type operator*() const noexcept { return this->simd; } + simd_type& operator*() noexcept { return this->simd; } + + KFR_I_CE mask(const base& v) noexcept + : base(base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec())) + { + } KFR_I_CE mask(const simd_type& simd) : base(simd) {} template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> - KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m)) + KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m.asvec())) { } template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))> KFR_I_CE mask(const vec<U, N>& m) : base(base::frombits(m)) { } + KFR_I_CE mask operator&(const mask& y) const noexcept + { + return static_cast<const base&>(*this) & static_cast<const base&>(y); + } + KFR_I_CE mask operator|(const mask& y) const noexcept + { + return static_cast<const base&>(*this) | static_cast<const base&>(y); + } KFR_I_CE mask operator&&(const mask& y) const noexcept { return static_cast<const base&>(*this) & static_cast<const base&>(y); @@ -96,7 +110,7 @@ struct mask : vec<T, N> { return static_cast<const base&>(*this) ^ static_cast<const base&>(y); } - KFR_I_CE mask operator^(const base& y) const noexcept { return static_cast<const base&>(*this) ^ y; } + KFR_I_CE mask operator~() const noexcept { return ~static_cast<const base&>(*this); } bool operator[](size_t index) const noexcept; @@ -123,7 +137,7 @@ constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> { return {}; } -} +} // namespace internal template <size_t groupsize, size_t... indices> constexpr inline auto scale() noexcept @@ -209,13 +223,13 @@ struct vec<vec<T, Nin>, N> : private vec<T, Nin * N> constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); } constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); } - constexpr friend vec operator++(vec& x, int)noexcept + constexpr friend vec operator++(vec& x, int) noexcept { const vec z = x; ++x; return z; } - constexpr friend vec operator--(vec& x, int)noexcept + constexpr friend vec operator--(vec& x, int) noexcept { const vec z = x; --x; @@ -256,10 +270,9 @@ struct vec<vec<T, Nin>, N> : private vec<T, Nin * N> CMT_GNU_CONSTEXPR void set(csize_t<index>, const value_type& s) noexcept { *this = vec(static_cast<const base&>(*this)) - .shuffle(s, csizeseq_t<N>() + - (csizeseq_t<N>() >= csize_t<index * Nin>() && - csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) * - N); + .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * Nin>() && + csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) * + N); } struct element { @@ -314,7 +327,7 @@ template <typename T, size_t N> struct is_vec_impl<vec<T, N>> : std::true_type { }; -} +} // namespace internal template <typename T> using is_vec = internal::is_vec_impl<T>; @@ -365,7 +378,7 @@ constexpr swiz<12> s12{}; constexpr swiz<13> s13{}; constexpr swiz<14> s14{}; constexpr swiz<15> s15{}; -} +} // namespace swizzle #endif CMT_PRAGMA_GNU(GCC diagnostic push) @@ -425,7 +438,7 @@ struct conversion<vec<To, N>, From> static_assert(std::is_convertible<From, To>::value, ""); static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); } }; -} +} // namespace internal template <typename T> constexpr size_t size_of() noexcept @@ -524,7 +537,7 @@ struct shuffle_index_wrap { constexpr inline size_t operator()(size_t index) const { return (start + index * stride) % count; } }; -} +} // namespace internal template <size_t count, typename T, size_t N, size_t Nout = N* count> CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x) @@ -585,7 +598,7 @@ CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, co const T list[] = { static_cast<T>(args)... }; return vec<T, N>(list[indices]...); } -} +} // namespace internal /// Create vector from scalar values /// @code @@ -710,7 +723,7 @@ constexpr CMT_INLINE vec<T1, N> operator^(const T1& x, const vec<T1, N>& y) { return static_cast<vec<T1, N>>(x) ^ y; } -} +} // namespace operators using namespace operators; @@ -739,7 +752,7 @@ constexpr vec<T, Nout> partial_mask() { return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>()); } -} +} // namespace internal template <typename T> using optvec = vec<T, platform<T>::vector_capacity / 4>; @@ -751,6 +764,7 @@ using f32x4 = vec<f32, 4>; using f32x8 = vec<f32, 8>; using f32x16 = vec<f32, 16>; using f32x32 = vec<f32, 32>; +using f32x64 = vec<f32, 64>; using f64x1 = vec<f64, 1>; using f64x2 = vec<f64, 2>; using f64x3 = vec<f64, 3>; @@ -758,6 +772,7 @@ using f64x4 = vec<f64, 4>; using f64x8 = vec<f64, 8>; using f64x16 = vec<f64, 16>; using f64x32 = vec<f64, 32>; +using f64x64 = vec<f64, 64>; using i8x1 = vec<i8, 1>; using i8x2 = vec<i8, 2>; using i8x3 = vec<i8, 3>; @@ -765,6 +780,7 @@ using i8x4 = vec<i8, 4>; using i8x8 = vec<i8, 8>; using i8x16 = vec<i8, 16>; using i8x32 = vec<i8, 32>; +using i8x64 = vec<i8, 64>; using i16x1 = vec<i16, 1>; using i16x2 = vec<i16, 2>; using i16x3 = vec<i16, 3>; @@ -772,6 +788,7 @@ using i16x4 = vec<i16, 4>; using i16x8 = vec<i16, 8>; using i16x16 = vec<i16, 16>; using i16x32 = vec<i16, 32>; +using i16x64 = vec<i16, 64>; using i32x1 = vec<i32, 1>; using i32x2 = vec<i32, 2>; using i32x3 = vec<i32, 3>; @@ -779,6 +796,7 @@ using i32x4 = vec<i32, 4>; using i32x8 = vec<i32, 8>; using i32x16 = vec<i32, 16>; using i32x32 = vec<i32, 32>; +using i32x64 = vec<i32, 64>; using i64x1 = vec<i64, 1>; using i64x2 = vec<i64, 2>; using i64x3 = vec<i64, 3>; @@ -786,6 +804,7 @@ using i64x4 = vec<i64, 4>; using i64x8 = vec<i64, 8>; using i64x16 = vec<i64, 16>; using i64x32 = vec<i64, 32>; +using i64x64 = vec<i64, 64>; using u8x1 = vec<u8, 1>; using u8x2 = vec<u8, 2>; using u8x3 = vec<u8, 3>; @@ -793,6 +812,7 @@ using u8x4 = vec<u8, 4>; using u8x8 = vec<u8, 8>; using u8x16 = vec<u8, 16>; using u8x32 = vec<u8, 32>; +using u8x64 = vec<u8, 64>; using u16x1 = vec<u16, 1>; using u16x2 = vec<u16, 2>; using u16x3 = vec<u16, 3>; @@ -800,6 +820,7 @@ using u16x4 = vec<u16, 4>; using u16x8 = vec<u16, 8>; using u16x16 = vec<u16, 16>; using u16x32 = vec<u16, 32>; +using u16x64 = vec<u16, 64>; using u32x1 = vec<u32, 1>; using u32x2 = vec<u32, 2>; using u32x3 = vec<u32, 3>; @@ -807,6 +828,7 @@ using u32x4 = vec<u32, 4>; using u32x8 = vec<u32, 8>; using u32x16 = vec<u32, 16>; using u32x32 = vec<u32, 32>; +using u32x64 = vec<u32, 64>; using u64x1 = vec<u64, 1>; using u64x2 = vec<u64, 2>; using u64x3 = vec<u64, 3>; @@ -814,6 +836,7 @@ using u64x4 = vec<u64, 4>; using u64x8 = vec<u64, 8>; using u64x16 = vec<u64, 16>; using u64x32 = vec<u64, 32>; +using u64x64 = vec<u64, 64>; using u8x2x2 = vec<vec<u8, 2>, 2>; using i8x2x2 = vec<vec<i8, 2>, 2>; @@ -851,7 +874,7 @@ using ivec4 = i32x4; using uvec2 = u32x2; using uvec3 = u32x3; using uvec4 = u32x4; -} +} // namespace glsl_names namespace opencl_names { using char2 = i8x2; @@ -909,7 +932,7 @@ using double3 = f64x3; using double4 = f64x4; using double8 = f64x8; using double16 = f64x16; -} +} // namespace opencl_names namespace internal { @@ -958,7 +981,7 @@ constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>) { return make_vector(((void)Indices, void(), fn())...); } -} +} // namespace internal template <typename T, size_t N, typename Fn, typename... Args, typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>> @@ -1048,7 +1071,7 @@ CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>) } KFR_FN(low) KFR_FN(high) -} +} // namespace kfr namespace cometa { @@ -1105,7 +1128,7 @@ struct compound_type_traits<kfr::mask<T, N>> return value[index]; } }; -} +} // namespace cometa namespace std { @@ -1140,7 +1163,7 @@ struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>> { using type = kfr::mask<typename common_type<T1, T2>::type, N>; }; -} +} // namespace std CMT_PRAGMA_GNU(GCC diagnostic pop) CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -20,6 +20,8 @@ extern char* gets(char* __s); #define CMT_ARCH_X32 1 #endif +#ifndef CMT_FORCE_GENERIC_CPU + #if defined __AVX512F__ && !defined CMT_ARCH_AVX512 #define CMT_ARCH_AVX512 1 #define CMT_ARCH_AVX2 1 @@ -107,6 +109,8 @@ extern char* gets(char* __s); #define CMT_ARCH_LZCNT 1 #endif +#endif // CMT_FORCE_GENERIC_CPU + #if defined CMT_ARCH_AVX512 #define CMT_ARCH_NAME avx512 #elif defined CMT_ARCH_AVX2 diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp @@ -26,6 +26,7 @@ #pragma once #include "../base/complex.hpp" +#include "../base/filter.hpp" #include "../base/constants.hpp" #include "../base/memory.hpp" #include "../base/read_write.hpp" @@ -42,88 +43,41 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") namespace kfr { +namespace internal +{ +template <typename T> +univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2); +template <typename T> +univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2); +template <typename T> +univector<T> autocorrelate(const univector_ref<const T>& src1); +} // namespace internal + template <typename T, size_t Tag1, size_t Tag2> -CMT_FUNC univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) +univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) { - const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); - univector<complex<T>> src1padded = src1; - univector<complex<T>> src2padded = src2; - src1padded.resize(size, 0); - src2padded.resize(size, 0); - - dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size); - univector<u8> temp(dft->temp_size); - dft->execute(src1padded, src1padded, temp); - dft->execute(src2padded, src2padded, temp); - src1padded = src1padded * src2padded; - dft->execute(src1padded, src1padded, temp, true); - const T invsize = reciprocal<T>(size); - return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; + return internal::convolve(src1.slice(), src2.slice()); } template <typename T, size_t Tag1, size_t Tag2> -CMT_FUNC univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) +univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2) { - const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); - univector<complex<T>> src1padded = src1; - univector<complex<T>> src2padded = reverse(src2); - src1padded.resize(size, 0); - src2padded.resize(size, 0); - dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size); - univector<u8> temp(dft->temp_size); - dft->execute(src1padded, src1padded, temp); - dft->execute(src2padded, src2padded, temp); - src1padded = src1padded * src2padded; - dft->execute(src1padded, src1padded, temp, true); - const T invsize = reciprocal<T>(size); - return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; + return internal::correlate(src1.slice(), src2.slice()); } template <typename T, size_t Tag1> -CMT_FUNC univector<T> autocorrelate(const univector<T, Tag1>& src) +univector<T> autocorrelate(const univector<T, Tag1>& src) { - univector<T> result = correlate(src, src); - result = result.slice(result.size() / 2); - return result; + return internal::autocorrelate(src.slice()); } template <typename T> class convolve_filter : public filter<T> { public: - explicit convolve_filter(size_t size, size_t block_size = 1024) - : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size), - segments((size + block_size - 1) / block_size) - { - } - explicit convolve_filter(const univector<T>& data, size_t block_size = 1024) - : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)), - temp(fft.temp_size), - segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), - ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), - input_position(0), position(0) - { - set_data(data); - } - void set_data(const univector<T>& data) - { - univector<T> input(fft.size); - const T ifftsize = reciprocal(T(fft.size)); - for (size_t i = 0; i < ir_segments.size(); i++) - { - segments[i].resize(block_size); - ir_segments[i].resize(block_size, 0); - input = padded(data.slice(i * block_size, block_size)); - - fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm); - process(ir_segments[i], ir_segments[i] * ifftsize); - } - saved_input.resize(block_size, 0); - scratch.resize(block_size * 2); - premul.resize(block_size, 0); - cscratch.resize(block_size); - overlap.resize(block_size, 0); - } + explicit convolve_filter(size_t size, size_t block_size = 1024); + explicit convolve_filter(const univector<T>& data, size_t block_size = 1024); + void set_data(const univector<T>& data); protected: void process_expression(T* dest, const expression_pointer<T>& src, size_t size) final @@ -131,49 +85,7 @@ protected: univector<T> input = truncate(src, size); process_buffer(dest, input.data(), input.size()); } - void process_buffer(T* output, const T* input, size_t size) final - { - size_t processed = 0; - while (processed < size) - { - const size_t processing = std::min(size - processed, block_size - input_position); - internal::builtin_memcpy(saved_input.data() + input_position, input + processed, - processing * sizeof(T)); - - process(scratch, padded(saved_input)); - fft.execute(segments[position], scratch, temp, dft_pack_format::Perm); - - if (input_position == 0) - { - process(premul, zeros()); - for (size_t i = 1; i < segments.size(); i++) - { - const size_t n = (position + i) % segments.size(); - fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm); - } - } - fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], - dft_pack_format::Perm); - - fft.execute(scratch, cscratch, temp, dft_pack_format::Perm); - - process(make_univector(output + processed, processing), - scratch.slice(input_position) + overlap.slice(input_position)); - - input_position += processing; - if (input_position == block_size) - { - input_position = 0; - process(saved_input, zeros()); - - internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T)); - - position = position > 0 ? position - 1 : segments.size() - 1; - } - - processed += processing; - } - } + void process_buffer(T* output, const T* input, size_t size) final; const dft_plan_real<T> fft; univector<u8> temp; @@ -189,5 +101,6 @@ protected: univector<T> overlap; size_t position; }; -} + +} // namespace kfr CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dft/dft-src.cpp b/include/kfr/dft/dft-src.cpp @@ -24,10 +24,13 @@ See https://www.kfrlib.com for details. */ +#include "../base/basic_expressions.hpp" +#include "../testo/assert.hpp" #include "bitrev.hpp" +#include "cache.hpp" +#include "convolution.hpp" #include "fft.hpp" #include "ft.hpp" -#include "../testo/assert.hpp" CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wshadow") @@ -248,7 +251,7 @@ template <typename T, size_t width> CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) { const size_t count = stage_size / 4; - size_t nnstep = size / stage_size; + size_t nnstep = size / stage_size; DFT_ASSERT(width <= count); CMT_LOOP_NOUNROLL for (size_t n = 0; n < count; n += width) @@ -259,7 +262,7 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s } } -#ifdef CMT_ARCH_X86 +#if defined CMT_ARCH_SSE #ifdef CMT_COMPILER_GNU #define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); #else @@ -484,11 +487,11 @@ struct fft_final_stage_impl : dft_stage<T> } protected: - constexpr static size_t width = fft_vector_width<T>; - constexpr static bool is_even = cometa::is_even(ilog2(size)); - constexpr static bool use_br2 = !is_even; - constexpr static bool aligned = false; - constexpr static bool prefetch = splitin; + constexpr static size_t width = fft_vector_width<T>; + constexpr static bool is_even = cometa::is_even(ilog2(size)); + constexpr static bool use_br2 = !is_even; + constexpr static bool aligned = false; + constexpr static bool prefetch = splitin; KFR_INTRIN void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {} KFR_INTRIN void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {} @@ -510,7 +513,7 @@ protected: virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); final_stage(csize<size>, 1, cbool<splitin>, out, in, twiddle); } @@ -1064,6 +1067,164 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_ template <typename T> dft_plan<T>::~dft_plan() = default; +namespace internal +{ + +template <typename T> +univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2) +{ + const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); + univector<complex<T>> src1padded = src1; + univector<complex<T>> src2padded = src2; + src1padded.resize(size, 0); + src2padded.resize(size, 0); + + dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size); + univector<u8> temp(dft->temp_size); + dft->execute(src1padded, src1padded, temp); + dft->execute(src2padded, src2padded, temp); + src1padded = src1padded * src2padded; + dft->execute(src1padded, src1padded, temp, true); + const T invsize = reciprocal<T>(size); + return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; +} + +template <typename T> +univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2) +{ + const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); + univector<complex<T>> src1padded = src1; + univector<complex<T>> src2padded = reverse(src2); + src1padded.resize(size, 0); + src2padded.resize(size, 0); + dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size); + univector<u8> temp(dft->temp_size); + dft->execute(src1padded, src1padded, temp); + dft->execute(src2padded, src2padded, temp); + src1padded = src1padded * src2padded; + dft->execute(src1padded, src1padded, temp, true); + const T invsize = reciprocal<T>(size); + return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; +} + +template <typename T> +univector<T> autocorrelate(const univector_ref<const T>& src1) +{ + return internal::autocorrelate(src1.slice()); + univector<T> result = correlate(src1, src1); + result = result.slice(result.size() / 2); + return result; +} + +template univector<float> convolve<float>(const univector_ref<const float>&, + const univector_ref<const float>&); +template univector<double> convolve<double>(const univector_ref<const double>&, + const univector_ref<const double>&); +template univector<float> correlate<float>(const univector_ref<const float>&, + const univector_ref<const float>&); +template univector<double> correlate<double>(const univector_ref<const double>&, + const univector_ref<const double>&); + +template univector<float> autocorrelate<float>(const univector_ref<const float>&); +template univector<double> autocorrelate<double>(const univector_ref<const double>&); + +} // namespace internal + +template <typename T> +convolve_filter<T>::convolve_filter(size_t size, size_t block_size) + : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size), + segments((size + block_size - 1) / block_size) +{ +} + +template <typename T> +convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size) + : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)), + temp(fft.temp_size), + segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), + ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), + input_position(0), position(0) +{ + set_data(data); +} + +template <typename T> +void convolve_filter<T>::set_data(const univector<T>& data) +{ + univector<T> input(fft.size); + const T ifftsize = reciprocal(T(fft.size)); + for (size_t i = 0; i < ir_segments.size(); i++) + { + segments[i].resize(block_size); + ir_segments[i].resize(block_size, 0); + input = padded(data.slice(i * block_size, block_size)); + + fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm); + process(ir_segments[i], ir_segments[i] * ifftsize); + } + saved_input.resize(block_size, 0); + scratch.resize(block_size * 2); + premul.resize(block_size, 0); + cscratch.resize(block_size); + overlap.resize(block_size, 0); +} + +template <typename T> +void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) +{ + size_t processed = 0; + while (processed < size) + { + const size_t processing = std::min(size - processed, block_size - input_position); + internal::builtin_memcpy(saved_input.data() + input_position, input + processed, + processing * sizeof(T)); + + process(scratch, padded(saved_input)); + fft.execute(segments[position], scratch, temp, dft_pack_format::Perm); + + if (input_position == 0) + { + process(premul, zeros()); + for (size_t i = 1; i < segments.size(); i++) + { + const size_t n = (position + i) % segments.size(); + fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm); + } + } + fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], dft_pack_format::Perm); + + fft.execute(scratch, cscratch, temp, dft_pack_format::Perm); + + process(make_univector(output + processed, processing), + scratch.slice(input_position) + overlap.slice(input_position)); + + input_position += processing; + if (input_position == block_size) + { + input_position = 0; + process(saved_input, zeros()); + + internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T)); + + position = position > 0 ? position - 1 : segments.size() - 1; + } + + processed += processing; + } +} + +template convolve_filter<float>::convolve_filter(size_t, size_t); +template convolve_filter<double>::convolve_filter(size_t, size_t); + +template convolve_filter<float>::convolve_filter(const univector<float>&, size_t); +template convolve_filter<double>::convolve_filter(const univector<double>&, size_t); + +template void convolve_filter<float>::set_data(const univector<float>&); +template void convolve_filter<double>::set_data(const univector<double>&); + +template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size); +template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size); + template dft_plan<float>::dft_plan(size_t, cbools_t<false, true>); template dft_plan<float>::dft_plan(size_t, cbools_t<true, false>); template dft_plan<float>::dft_plan(size_t, cbools_t<true, true>); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -15,7 +15,7 @@ # along with KFR. -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.0) set(TEST_SRC ) @@ -38,16 +38,65 @@ endif () find_package(MPFR) +set(ALL_TESTS_CPP + all_tests.cpp + base_test.cpp + complex_test.cpp + dft_test.cpp + dsp_test.cpp + expression_test.cpp + intrinsic_test.cpp) + +if (MPFR_FOUND) + list(APPEND ALL_TESTS_CPP transcendental_test.cpp) +endif () + +add_executable(all_tests ${ALL_TESTS_CPP} ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp) +target_compile_definitions(all_tests PRIVATE KFR_NO_MAIN) + add_executable(intrinsic_test intrinsic_test.cpp ${KFR_SRC} ${TEST_SRC}) add_executable(dft_test dft_test.cpp ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp) if (MPFR_FOUND) + add_definitions(-DHAVE_MPFR) include_directories(${MPFR_INCLUDE_DIR}) add_executable(transcendental_test transcendental_test.cpp ${KFR_SRC} ${TEST_SRC}) + target_link_libraries(all_tests ${MPFR_LIBRARIES}) target_link_libraries(transcendental_test ${MPFR_LIBRARIES}) if (WIN32) target_link_libraries(transcendental_test gmp) # for MSYS + target_link_libraries(all_tests gmp) # for MSYS endif() endif () + +function(add_x86_test NAME FLAGS) + separate_arguments(FLAGS) + add_executable(all_tests_${NAME} ${ALL_TESTS_CPP} ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp) + target_compile_options(all_tests_${NAME} PRIVATE ${FLAGS}) + target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN) + if(MPFR_FOUND) + target_link_libraries(all_tests_${NAME} ${MPFR_LIBRARIES}) + if (WIN32) + target_link_libraries(all_tests_${NAME} gmp) # for MSYS + endif() + endif() +endfunction() + +message(STATUS CMAKE_SYSTEM_PROCESSOR = ${CMAKE_SYSTEM_PROCESSOR}) + +set(ARCH_TESTS 0) + +if (CLANG AND CMAKE_SYSTEM_PROCESSOR MATCHES "([aA][mM][dD]64)") + set(ARCH_TESTS 1) + add_x86_test(generic "-march=x86-64 -mno-avx -DCMT_FORCE_GENERIC_CPU") + add_x86_test(sse2 "-march=x86-64 -msse2") + add_x86_test(sse3 "-march=x86-64 -msse3 -mno-avx") + add_x86_test(ssse3 "-march=x86-64 -mssse3 -mno-avx") + add_x86_test(sse41 "-march=x86-64 -msse4.1 -mno-avx") + add_x86_test(avx "-march=x86-64 -msse4.1 -mavx") + add_x86_test(avx2 "-march=x86-64 -msse4.1 -mavx2 -mfma") + add_x86_test(avx512 "-march=x86-64 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl") +endif() + add_executable(dsp_test dsp_test.cpp ${KFR_SRC} ${TEST_SRC}) add_executable(empty_test empty_test.cpp ${KFR_SRC} ${TEST_SRC}) add_executable(complex_test complex_test.cpp ${KFR_SRC} ${TEST_SRC}) @@ -56,7 +105,11 @@ add_executable(expression_test expression_test.cpp ${KFR_SRC} ${TEST_SRC}) add_executable(ebu_test ebu_test.cpp ${KFR_SRC} ${TEST_SRC}) -if (ARM) +if(USE_SDE) + find_program(EMULATOR "sde") + list(APPEND EMULATOR "-skx") + list(APPEND EMULATOR "--") +elseif (ARM) find_program(EMULATOR "qemu-arm") else () set(EMULATOR "") @@ -86,4 +139,15 @@ if (NOT IOS) endif () add_test(NAME dft_test COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dft_test) + + if(ARCH_TESTS) + add_test(NAME generic COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_generic ) + add_test(NAME sse2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse2 ) + add_test(NAME sse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse3 ) + add_test(NAME ssse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_ssse3 ) + add_test(NAME sse41 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse41 ) + add_test(NAME avx COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx ) + add_test(NAME avx2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx2 ) + add_test(NAME avx512 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx512 ) + endif() endif () diff --git a/tests/all_tests.cpp b/tests/all_tests.cpp @@ -0,0 +1,22 @@ +#include <kfr/io/tostring.hpp> +#include <kfr/testo/testo.hpp> +#include <kfr/version.hpp> +#ifdef HAVE_MPFR +#include "mpfr/mpfrplus.hpp" +#endif + +using namespace kfr; + +int main() +{ + println(library_version(), " running on ", cpu_runtime()); + if (get_cpu() < cpu_t::native) + { + println("CPU is not supported"); + return -1; + } +#ifdef HAVE_MPFR + mpfr::scoped_precision p(128); +#endif + return testo::run_all(""); +} diff --git a/tests/base_test.cpp b/tests/base_test.cpp @@ -377,8 +377,10 @@ TEST(test_stat) } } +#ifndef KFR_NO_MAIN int main() { println(library_version()); return testo::run_all("", true); } +#endif diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp @@ -210,9 +210,11 @@ TEST(static_tests) testo::assert_is_same<common_type<complex<int>, double>, complex<double>>(); } +#ifndef KFR_NO_MAIN int main() { println(library_version()); return testo::run_all("", true); } +#endif diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -14,9 +14,9 @@ using namespace kfr; #ifdef KFR_NATIVE_F64 -constexpr ctypes_t<float, double> float_types{}; +constexpr ctypes_t<float, double> dft_float_types{}; #else -constexpr ctypes_t<float> float_types{}; +constexpr ctypes_t<float> dft_float_types{}; #endif TEST(test_convolve) @@ -62,7 +62,7 @@ TEST(fft_real) kfr::univector<float_type, size> in = gen_random_range<float_type>(gen, -1.0, +1.0); kfr::univector<kfr::complex<float_type>, size / 2 + 1> out = realdft(in); - kfr::univector<float_type, size> rev = irealdft(out) / size; + kfr::univector<float_type, size> rev = irealdft(out) / size; CHECK(rms(rev - in) <= 0.00001f); } @@ -71,7 +71,7 @@ TEST(fft_accuracy) testo::active_test()->show_progress = true; random_bit_generator gen(2247448713, 915890490, 864203735, 2982561); - testo::matrix(named("type") = float_types, // + testo::matrix(named("type") = dft_float_types, // named("inverse") = std::make_tuple(false, true), // named("log2(size)") = make_range(size_t(1), stopsize), // [&gen](auto type, bool inverse, size_t log2size) { @@ -115,16 +115,18 @@ TEST(fft_accuracy) univector<float_type> out2(size, 0.f); dft.execute(out2, out, temp); - out2 = out2 / size; + out2 = out2 / size; const float_type rms_diff_r2 = rms(in - out2); CHECK(rms_diff_r2 < epsilon * ops); } }); } +#ifndef KFR_NO_MAIN int main() { println(library_version(), " running on ", cpu_runtime()); return testo::run_all("", true); } +#endif diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp @@ -306,8 +306,10 @@ TEST(fir) }); } +#ifndef KFR_NO_MAIN int main() { println(library_version()); return testo::run_all("", true); } +#endif diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp @@ -146,9 +146,11 @@ TEST(partition) } } +#ifndef KFR_NO_MAIN int main() { println(library_version()); return testo::run_all("", true); } +#endif diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp @@ -12,8 +12,8 @@ using namespace kfr; -constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, // - i16x1, i16x2, i16x4, i16x8, i16x16, i16x3, // +constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64, i8x3, // + i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x3, // i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 // #ifdef KFR_NATIVE_I64 , @@ -22,8 +22,8 @@ constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, // > signed_types{}; -constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x3, // - u16x1, u16x2, u16x4, u16x8, u16x16, u16x3, // +constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64, u8x3, // + u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x3, // u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 // #ifdef KFR_NATIVE_I64 , @@ -127,6 +127,16 @@ inline T ref_satsub(T x, T y) return result; } +TEST(intrin_select) +{ + testo::matrix(named("type") = cconcat(signed_types, cconcat(unsigned_types, float_types)), [](auto type) { + using Tvec = type_of<decltype(type)>; + using T = subtype<Tvec>; + CHECK(kfr::select(make_mask<T>(false), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(2)); + CHECK(kfr::select(make_mask<T>(true), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(1)); + }); +} + TEST(intrin_abs) { testo::assert_is_same<decltype(kfr::abs(1)), int>(); @@ -387,4 +397,10 @@ TEST(intrin_math) CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233))); } -int main() { return testo::run_all("", false); } +#ifndef KFR_NO_MAIN +int main() +{ + println(library_version(), " running on ", cpu_runtime()); + return testo::run_all("", false); +} +#endif diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp @@ -14,6 +14,8 @@ using namespace kfr; +using vector_types = ctypes_t<f32, f64, f32x2, f32x8, f32x16, f64x2, f64x4, f64x8>; + template <typename T> double ulps(T test, const mpfr::number& ref) { @@ -25,15 +27,72 @@ double ulps(T test, const mpfr::number& ref) mpfr::abs(mpfr::number(test) - std::nexttoward(test, HUGE_VALL))); } +template <typename T, size_t N> +double ulps(const vec<T, N>& test, const mpfr::number& ref) +{ + double u = 0; + for (size_t i = 0; i < N; ++i) + u = std::max(u, ulps(test[i], ref)); + return u; +} + TEST(test_sin_cos) { - testo::matrix(named("type") = ctypes_t<f32, f64>(), + testo::matrix(named("type") = vector_types(), named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05), [](auto type, double value) { using T = type_of<decltype(type)>; const T x(value); - CHECK(ulps(kfr::sin(x), mpfr::sin(x)) < 2.0); - CHECK(ulps(kfr::cos(x), mpfr::cos(x)) < 2.0); + CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0); + CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0); + }); + testo::matrix(named("type") = vector_types(), named("value") = make_range(-100.0, 100.0, 0.5), + [](auto type, double value) { + using T = type_of<decltype(type)>; + const T x(value); + CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0); + CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0); + }); +} + +TEST(test_tan) +{ + testo::matrix(named("type") = ctypes_t<f32>(), + named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.01), + [](auto type, double value) { + using T = type_of<decltype(type)>; + const T x(value); + CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 2.0); + }); + testo::matrix(named("type") = ctypes_t<f32>(), named("value") = make_range(-100.0, 100.0, 0.5), + [](auto type, double value) { + using T = type_of<decltype(type)>; + const T x(value); + CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 3.0); + }); +} + +TEST(test_asin_acos_atan) +{ + testo::matrix(named("type") = vector_types(), named("value") = make_range(-1.0, 1.0, 0.05), + [](auto type, double value) { + using T = type_of<decltype(type)>; + const T x(value); + CHECK(ulps(kfr::asin(x), mpfr::asin(subtype<T>(value))) < 2.0); + CHECK(ulps(kfr::acos(x), mpfr::acos(subtype<T>(value))) < 2.0); + CHECK(ulps(kfr::atan(x), mpfr::atan(subtype<T>(value))) < 2.0); + }); +} + +TEST(test_atan2) +{ + testo::matrix(named("type") = vector_types(), named("value1") = make_range(-1.0, 1.0, 0.1), + named("value2") = make_range(-1.0, 1.0, 0.1), [](auto type, double value1, double value2) { + using T = type_of<decltype(type)>; + const T x(value1); + const T y(value2); + CHECK(ulps(kfr::atan2(x, y), mpfr::atan2(subtype<T>(value1), subtype<T>(value2))) < + 2.0); }); } @@ -97,8 +156,11 @@ TEST(test_exp10) }); } +#ifndef KFR_NO_MAIN int main() { + println(library_version(), " running on ", cpu_runtime()); mpfr::scoped_precision p(128); return testo::run_all(""); } +#endif