commit 51f526fdde45132c83c9c6368a770aed291be3f2
parent 509d0251987260ef1f7845592147120e1158b75b
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Mon, 19 Nov 2018 05:41:10 +0300
KFR 3.0 fixes/tests
Diffstat:
24 files changed, 484 insertions(+), 205 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -15,13 +15,7 @@
# along with KFR.
-cmake_minimum_required(VERSION 2.8)
-
-if (CMAKE_VERSION VERSION_LESS "2.8.12")
- function(add_compile_options)
- add_definitions(${ARGN})
- endfunction(add_compile_options)
-endif ()
+cmake_minimum_required(VERSION 3.0)
#set(DISABLE_CLANG 1)
@@ -91,7 +85,7 @@ if (${CMAKE_GENERATOR} STREQUAL "MinGW Makefiles"
OR ${CMAKE_GENERATOR} STREQUAL "MSYS Makefiles"
OR ${CMAKE_GENERATOR} STREQUAL "Unix Makefiles"
OR ${CMAKE_GENERATOR} STREQUAL "Ninja")
-
+
if (CMAKE_CXX_COMPILER MATCHES "clang" AND NOT CMAKE_CXX_COMPILER MATCHES "clang-cl")
if (WIN32)
# On windows, clang requires --target
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -15,7 +15,7 @@
# along with KFR.
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
include_directories(../include)
diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp
@@ -114,8 +114,8 @@ KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x)
r = mulsign(r, x);
r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r);
r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r);
- r = select(y == 0.0f, (x < 0) & pi, r);
- r = (isnan(x) || isnan(y)) | mulsign(r, y);
+ r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r);
+ r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
return r;
}
@@ -129,8 +129,8 @@ KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x)
r = mulsign(r, x);
r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r);
r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r);
- r = select(y == 0.0, (x < 0) & pi, r);
- r = (isnan(x) || isnan(y)) | mulsign(r, y);
+ r = select(y == 0.0, select(x < 0., pi, 0.), r);
+ r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
return r;
}
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -27,6 +27,7 @@
#include "univector.hpp"
#include "vec.hpp"
+#include "operators.hpp"
#include <algorithm>
namespace kfr
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -37,7 +37,8 @@ namespace intrinsics
template <size_t bits>
struct bitmask
{
- using type = findinttype<0, (1ull << bits) - 1>;
+ using type = conditional<(bits > 32), uint64_t,
+ conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>;
bitmask(type val) : value(val) {}
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -82,11 +82,14 @@ inline vec<u8, N> random_bits(random_bit_generator& gen)
{
return narrow<N>(bitcast<u8>(gen()));
}
+
template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
inline vec<u8, N> random_bits(random_bit_generator& gen)
{
- constexpr size_t N2 = prev_poweroftwo(N - 1);
- return concat(random_bits<N2>(gen), random_bits<N - N2>(gen));
+ constexpr size_t N2 = prev_poweroftwo(N - 1);
+ const vec<u8, N2> bits1 = random_bits<N2>(gen);
+ const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
+ return concat(bits1, bits2);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_integral<T>::value)>
@@ -162,7 +165,7 @@ struct expression_random_range : input_expression
const T min;
const T max;
};
-}
+} // namespace internal
template <typename T>
inline internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen)
@@ -187,4 +190,4 @@ inline internal::expression_random_range<T> gen_random_range(T min, T max)
{
return internal::expression_random_range<T>(random_bit_generator(seed_from_rdtsc), min, max);
}
-}
+} // namespace kfr
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -116,25 +116,25 @@ template <size_t N>
KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
- return t - ((x < t) & 1.f);
+ return t - select(x < t, 1.f, 0.f);
}
template <size_t N>
KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
- return t - ((x < t) & 1.0);
+ return t - select(x < t, 1., 0.);
}
template <size_t N>
KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
- return t + ((x > t) & 1.f);
+ return t + select(x > t, 1.f, 0.f);
}
template <size_t N>
KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
- return t + ((x > t) & 1.0);
+ return t + select(x > t, 1., 0.);
}
template <size_t N>
KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x)
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -167,7 +167,7 @@ KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, co
template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
- return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c)));
+ return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
diff --git a/include/kfr/base/simd_clang.hpp b/include/kfr/base/simd_clang.hpp
@@ -189,7 +189,7 @@ struct alignas(alignof(internal::simd_type<T, N>)) vec : public vec_t<T, N>
constexpr friend mask_t operator<=(const vec& x, const vec& y) noexcept { return KFR_S(*x <= *y); }
constexpr friend mask_t operator>=(const vec& x, const vec& y) noexcept { return KFR_S(*x >= *y); }
- constexpr mask_t asmask() const noexcept { return mask_t(*this); }
+ constexpr mask_t asmask() const noexcept { return mask_t(simd); }
#undef KFR_S
#undef KFR_U
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -61,14 +61,14 @@ template <typename T, size_t N, typename Tprecise = f64>
KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
{
const vec<T, N> xabs = abs(x);
- constexpr T div = constants<T>::fold_constant_div;
- vec<T, N> y = floor(xabs / div);
- quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
+ constexpr T div = constants<T>::fold_constant_div;
+ vec<T, N> y = floor(xabs / div);
+ quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
const mask<T, N> msk = (quadrant & 1) != 0;
- quadrant = kfr::select(msk, quadrant + 1, quadrant);
- y = select(msk, y + T(1.0), y);
- quadrant = quadrant & 7;
+ quadrant = kfr::select(msk, quadrant + 1, quadrant);
+ y = select(msk, y + T(1.0), y);
+ quadrant = quadrant & 7;
constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi);
constexpr T rem1 = constants<T>::fold_constant_rem1;
@@ -134,10 +134,11 @@ KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cos
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x_full, quadrant);
- mask<T, N> flip_sign = kfr::select(cosmask, (quadrant == 2) || (quadrant == 4), quadrant >= 4);
+ mask<T, N> flip_sign =
+ kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask();
mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
- usecos = usecos ^ cosmask;
+ usecos = usecos ^ cosmask;
vec<T, N> formula = trig_sincos(folded, usecos);
@@ -160,7 +161,7 @@ KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x)
vec<T, N> formula = trig_sincos(folded, usecos);
- formula = select(flip_sign ^ x, -formula, formula);
+ formula = select(flip_sign ^ mask<T, N>(x), -formula, formula);
return formula;
}
@@ -193,15 +194,15 @@ KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x)
vec<T, N> xx = x - pi;
vec<T, N> y = abs(xx);
- y = select(y > c_pi<T, 1, 2>, pi - y, y);
- y = y ^ (msk & ~xx);
+ y = select(y > c_pi<T, 1, 2>, pi - y, y);
+ y = y ^ (msk & ~xx);
vec<T, N> y2 = y * y;
vec<T, N> formula = c6;
vec<T, N> y3 = y2 * y;
- formula = fmadd(formula, y2, c4);
- formula = fmadd(formula, y2, c2);
- formula = formula * y3 + y;
+ formula = fmadd(formula, y2, c4);
+ formula = fmadd(formula, y2, c2);
+ formula = formula * y3 + y;
return formula;
}
@@ -316,7 +317,7 @@ KFR_SINTRIN Tout cossindeg(const T& x)
{
return cossin(x * constants<Tout>::degtorad);
}
-}
+} // namespace intrinsics
KFR_I_FN(sin)
KFR_I_FN(cos)
@@ -616,4 +617,4 @@ KFR_SINTRIN T cos3x(const T& sinx, const T& cosx)
{
return cosx * (1 - 4 * sqr(sinx));
}
-}
+} // namespace kfr
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -51,10 +51,10 @@ KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inve
vec<T, N> x = y - k_real * pi_14;
mask<T, N> need_offset = (k & 1) != 0;
- x = select(need_offset, x - pi_14, x);
+ x = select(need_offset, x - pi_14, x);
vec<IT, N> k_mod4 = k & 3;
- inverse = (k_mod4 == 1) || (k_mod4 == 2);
+ inverse = (k_mod4 == 1) || (k_mod4 == 2);
return x;
}
@@ -62,7 +62,9 @@ template <size_t N>
KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full)
{
mask<f32, N> inverse;
- const vec<f32, N> x = trig_fold_simple(x_full, inverse);
+ vec<i32, N> quad;
+ const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
+ inverse = quad == 2 || quad == 6;
constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01);
constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01);
@@ -90,7 +92,9 @@ template <size_t N>
KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full)
{
mask<f64, N> inverse;
- const vec<f64, N> x = trig_fold_simple(x_full, inverse);
+ vec<i64, N> quad;
+ const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
+ inverse = quad == 2 || quad == 6;
constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01);
constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01);
@@ -129,7 +133,7 @@ KFR_SINTRIN flt_type<T> tandeg(const T& x)
{
return tan(x * c_degtorad<flt_type<T>>);
}
-}
+} // namespace intrinsics
KFR_I_FN(tan)
KFR_I_FN(tandeg)
@@ -156,4 +160,4 @@ KFR_FUNC internal::expression_function<fn::tandeg, E1> tandeg(E1&& x)
{
return { fn::tandeg(), std::forward<E1>(x) };
}
-}
+} // namespace kfr
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -37,7 +37,7 @@ template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)>
CMT_INLINE vec<T, Nout> low(const vec<T, N>& x);
template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)>
CMT_INLINE vec<T, Nout> high(const vec<T, N>& x);
-}
+} // namespace kfr
#ifdef CMT_COMPILER_CLANG
#include "simd_clang.hpp"
@@ -65,7 +65,7 @@ template <typename T>
using maskfor = typename T::mask_t;
template <typename T, size_t N>
-struct mask : vec<T, N>
+struct mask : protected vec<T, N>
{
using base = vec<T, N>;
KFR_I_CE mask() noexcept = default;
@@ -73,17 +73,31 @@ struct mask : vec<T, N>
KFR_I_CE mask& operator=(const mask&) noexcept = default;
using simd_type = typename base::simd_type;
- KFR_I_CE mask(const base& v) noexcept : base(v) {}
+ simd_type operator*() const noexcept { return this->simd; }
+ simd_type& operator*() noexcept { return this->simd; }
+
+ KFR_I_CE mask(const base& v) noexcept
+ : base(base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()))
+ {
+ }
KFR_I_CE mask(const simd_type& simd) : base(simd) {}
template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
- KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m))
+ KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m.asvec()))
{
}
template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
KFR_I_CE mask(const vec<U, N>& m) : base(base::frombits(m))
{
}
+ KFR_I_CE mask operator&(const mask& y) const noexcept
+ {
+ return static_cast<const base&>(*this) & static_cast<const base&>(y);
+ }
+ KFR_I_CE mask operator|(const mask& y) const noexcept
+ {
+ return static_cast<const base&>(*this) | static_cast<const base&>(y);
+ }
KFR_I_CE mask operator&&(const mask& y) const noexcept
{
return static_cast<const base&>(*this) & static_cast<const base&>(y);
@@ -96,7 +110,7 @@ struct mask : vec<T, N>
{
return static_cast<const base&>(*this) ^ static_cast<const base&>(y);
}
- KFR_I_CE mask operator^(const base& y) const noexcept { return static_cast<const base&>(*this) ^ y; }
+ KFR_I_CE mask operator~() const noexcept { return ~static_cast<const base&>(*this); }
bool operator[](size_t index) const noexcept;
@@ -123,7 +137,7 @@ constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...>
{
return {};
}
-}
+} // namespace internal
template <size_t groupsize, size_t... indices>
constexpr inline auto scale() noexcept
@@ -209,13 +223,13 @@ struct vec<vec<T, Nin>, N> : private vec<T, Nin * N>
constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); }
constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); }
- constexpr friend vec operator++(vec& x, int)noexcept
+ constexpr friend vec operator++(vec& x, int) noexcept
{
const vec z = x;
++x;
return z;
}
- constexpr friend vec operator--(vec& x, int)noexcept
+ constexpr friend vec operator--(vec& x, int) noexcept
{
const vec z = x;
--x;
@@ -256,10 +270,9 @@ struct vec<vec<T, Nin>, N> : private vec<T, Nin * N>
CMT_GNU_CONSTEXPR void set(csize_t<index>, const value_type& s) noexcept
{
*this = vec(static_cast<const base&>(*this))
- .shuffle(s, csizeseq_t<N>() +
- (csizeseq_t<N>() >= csize_t<index * Nin>() &&
- csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) *
- N);
+ .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * Nin>() &&
+ csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) *
+ N);
}
struct element
{
@@ -314,7 +327,7 @@ template <typename T, size_t N>
struct is_vec_impl<vec<T, N>> : std::true_type
{
};
-}
+} // namespace internal
template <typename T>
using is_vec = internal::is_vec_impl<T>;
@@ -365,7 +378,7 @@ constexpr swiz<12> s12{};
constexpr swiz<13> s13{};
constexpr swiz<14> s14{};
constexpr swiz<15> s15{};
-}
+} // namespace swizzle
#endif
CMT_PRAGMA_GNU(GCC diagnostic push)
@@ -425,7 +438,7 @@ struct conversion<vec<To, N>, From>
static_assert(std::is_convertible<From, To>::value, "");
static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); }
};
-}
+} // namespace internal
template <typename T>
constexpr size_t size_of() noexcept
@@ -524,7 +537,7 @@ struct shuffle_index_wrap
{
constexpr inline size_t operator()(size_t index) const { return (start + index * stride) % count; }
};
-}
+} // namespace internal
template <size_t count, typename T, size_t N, size_t Nout = N* count>
CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x)
@@ -585,7 +598,7 @@ CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, co
const T list[] = { static_cast<T>(args)... };
return vec<T, N>(list[indices]...);
}
-}
+} // namespace internal
/// Create vector from scalar values
/// @code
@@ -710,7 +723,7 @@ constexpr CMT_INLINE vec<T1, N> operator^(const T1& x, const vec<T1, N>& y)
{
return static_cast<vec<T1, N>>(x) ^ y;
}
-}
+} // namespace operators
using namespace operators;
@@ -739,7 +752,7 @@ constexpr vec<T, Nout> partial_mask()
{
return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>());
}
-}
+} // namespace internal
template <typename T>
using optvec = vec<T, platform<T>::vector_capacity / 4>;
@@ -751,6 +764,7 @@ using f32x4 = vec<f32, 4>;
using f32x8 = vec<f32, 8>;
using f32x16 = vec<f32, 16>;
using f32x32 = vec<f32, 32>;
+using f32x64 = vec<f32, 64>;
using f64x1 = vec<f64, 1>;
using f64x2 = vec<f64, 2>;
using f64x3 = vec<f64, 3>;
@@ -758,6 +772,7 @@ using f64x4 = vec<f64, 4>;
using f64x8 = vec<f64, 8>;
using f64x16 = vec<f64, 16>;
using f64x32 = vec<f64, 32>;
+using f64x64 = vec<f64, 64>;
using i8x1 = vec<i8, 1>;
using i8x2 = vec<i8, 2>;
using i8x3 = vec<i8, 3>;
@@ -765,6 +780,7 @@ using i8x4 = vec<i8, 4>;
using i8x8 = vec<i8, 8>;
using i8x16 = vec<i8, 16>;
using i8x32 = vec<i8, 32>;
+using i8x64 = vec<i8, 64>;
using i16x1 = vec<i16, 1>;
using i16x2 = vec<i16, 2>;
using i16x3 = vec<i16, 3>;
@@ -772,6 +788,7 @@ using i16x4 = vec<i16, 4>;
using i16x8 = vec<i16, 8>;
using i16x16 = vec<i16, 16>;
using i16x32 = vec<i16, 32>;
+using i16x64 = vec<i16, 64>;
using i32x1 = vec<i32, 1>;
using i32x2 = vec<i32, 2>;
using i32x3 = vec<i32, 3>;
@@ -779,6 +796,7 @@ using i32x4 = vec<i32, 4>;
using i32x8 = vec<i32, 8>;
using i32x16 = vec<i32, 16>;
using i32x32 = vec<i32, 32>;
+using i32x64 = vec<i32, 64>;
using i64x1 = vec<i64, 1>;
using i64x2 = vec<i64, 2>;
using i64x3 = vec<i64, 3>;
@@ -786,6 +804,7 @@ using i64x4 = vec<i64, 4>;
using i64x8 = vec<i64, 8>;
using i64x16 = vec<i64, 16>;
using i64x32 = vec<i64, 32>;
+using i64x64 = vec<i64, 64>;
using u8x1 = vec<u8, 1>;
using u8x2 = vec<u8, 2>;
using u8x3 = vec<u8, 3>;
@@ -793,6 +812,7 @@ using u8x4 = vec<u8, 4>;
using u8x8 = vec<u8, 8>;
using u8x16 = vec<u8, 16>;
using u8x32 = vec<u8, 32>;
+using u8x64 = vec<u8, 64>;
using u16x1 = vec<u16, 1>;
using u16x2 = vec<u16, 2>;
using u16x3 = vec<u16, 3>;
@@ -800,6 +820,7 @@ using u16x4 = vec<u16, 4>;
using u16x8 = vec<u16, 8>;
using u16x16 = vec<u16, 16>;
using u16x32 = vec<u16, 32>;
+using u16x64 = vec<u16, 64>;
using u32x1 = vec<u32, 1>;
using u32x2 = vec<u32, 2>;
using u32x3 = vec<u32, 3>;
@@ -807,6 +828,7 @@ using u32x4 = vec<u32, 4>;
using u32x8 = vec<u32, 8>;
using u32x16 = vec<u32, 16>;
using u32x32 = vec<u32, 32>;
+using u32x64 = vec<u32, 64>;
using u64x1 = vec<u64, 1>;
using u64x2 = vec<u64, 2>;
using u64x3 = vec<u64, 3>;
@@ -814,6 +836,7 @@ using u64x4 = vec<u64, 4>;
using u64x8 = vec<u64, 8>;
using u64x16 = vec<u64, 16>;
using u64x32 = vec<u64, 32>;
+using u64x64 = vec<u64, 64>;
using u8x2x2 = vec<vec<u8, 2>, 2>;
using i8x2x2 = vec<vec<i8, 2>, 2>;
@@ -851,7 +874,7 @@ using ivec4 = i32x4;
using uvec2 = u32x2;
using uvec3 = u32x3;
using uvec4 = u32x4;
-}
+} // namespace glsl_names
namespace opencl_names
{
using char2 = i8x2;
@@ -909,7 +932,7 @@ using double3 = f64x3;
using double4 = f64x4;
using double8 = f64x8;
using double16 = f64x16;
-}
+} // namespace opencl_names
namespace internal
{
@@ -958,7 +981,7 @@ constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>)
{
return make_vector(((void)Indices, void(), fn())...);
}
-}
+} // namespace internal
template <typename T, size_t N, typename Fn, typename... Args,
typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>>
@@ -1048,7 +1071,7 @@ CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>)
}
KFR_FN(low)
KFR_FN(high)
-}
+} // namespace kfr
namespace cometa
{
@@ -1105,7 +1128,7 @@ struct compound_type_traits<kfr::mask<T, N>>
return value[index];
}
};
-}
+} // namespace cometa
namespace std
{
@@ -1140,7 +1163,7 @@ struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
{
using type = kfr::mask<typename common_type<T1, T2>::type, N>;
};
-}
+} // namespace std
CMT_PRAGMA_GNU(GCC diagnostic pop)
CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -20,6 +20,8 @@ extern char* gets(char* __s);
#define CMT_ARCH_X32 1
#endif
+#ifndef CMT_FORCE_GENERIC_CPU
+
#if defined __AVX512F__ && !defined CMT_ARCH_AVX512
#define CMT_ARCH_AVX512 1
#define CMT_ARCH_AVX2 1
@@ -107,6 +109,8 @@ extern char* gets(char* __s);
#define CMT_ARCH_LZCNT 1
#endif
+#endif // CMT_FORCE_GENERIC_CPU
+
#if defined CMT_ARCH_AVX512
#define CMT_ARCH_NAME avx512
#elif defined CMT_ARCH_AVX2
diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp
@@ -26,6 +26,7 @@
#pragma once
#include "../base/complex.hpp"
+#include "../base/filter.hpp"
#include "../base/constants.hpp"
#include "../base/memory.hpp"
#include "../base/read_write.hpp"
@@ -42,88 +43,41 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
namespace kfr
{
+namespace internal
+{
+template <typename T>
+univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2);
+template <typename T>
+univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2);
+template <typename T>
+univector<T> autocorrelate(const univector_ref<const T>& src1);
+} // namespace internal
+
template <typename T, size_t Tag1, size_t Tag2>
-CMT_FUNC univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
+univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
{
- const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
- univector<complex<T>> src1padded = src1;
- univector<complex<T>> src2padded = src2;
- src1padded.resize(size, 0);
- src2padded.resize(size, 0);
-
- dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
- univector<u8> temp(dft->temp_size);
- dft->execute(src1padded, src1padded, temp);
- dft->execute(src2padded, src2padded, temp);
- src1padded = src1padded * src2padded;
- dft->execute(src1padded, src1padded, temp, true);
- const T invsize = reciprocal<T>(size);
- return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+ return internal::convolve(src1.slice(), src2.slice());
}
template <typename T, size_t Tag1, size_t Tag2>
-CMT_FUNC univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
+univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
{
- const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
- univector<complex<T>> src1padded = src1;
- univector<complex<T>> src2padded = reverse(src2);
- src1padded.resize(size, 0);
- src2padded.resize(size, 0);
- dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
- univector<u8> temp(dft->temp_size);
- dft->execute(src1padded, src1padded, temp);
- dft->execute(src2padded, src2padded, temp);
- src1padded = src1padded * src2padded;
- dft->execute(src1padded, src1padded, temp, true);
- const T invsize = reciprocal<T>(size);
- return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+ return internal::correlate(src1.slice(), src2.slice());
}
template <typename T, size_t Tag1>
-CMT_FUNC univector<T> autocorrelate(const univector<T, Tag1>& src)
+univector<T> autocorrelate(const univector<T, Tag1>& src)
{
- univector<T> result = correlate(src, src);
- result = result.slice(result.size() / 2);
- return result;
+ return internal::autocorrelate(src.slice());
}
template <typename T>
class convolve_filter : public filter<T>
{
public:
- explicit convolve_filter(size_t size, size_t block_size = 1024)
- : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size),
- segments((size + block_size - 1) / block_size)
- {
- }
- explicit convolve_filter(const univector<T>& data, size_t block_size = 1024)
- : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)),
- temp(fft.temp_size),
- segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
- ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
- input_position(0), position(0)
- {
- set_data(data);
- }
- void set_data(const univector<T>& data)
- {
- univector<T> input(fft.size);
- const T ifftsize = reciprocal(T(fft.size));
- for (size_t i = 0; i < ir_segments.size(); i++)
- {
- segments[i].resize(block_size);
- ir_segments[i].resize(block_size, 0);
- input = padded(data.slice(i * block_size, block_size));
-
- fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm);
- process(ir_segments[i], ir_segments[i] * ifftsize);
- }
- saved_input.resize(block_size, 0);
- scratch.resize(block_size * 2);
- premul.resize(block_size, 0);
- cscratch.resize(block_size);
- overlap.resize(block_size, 0);
- }
+ explicit convolve_filter(size_t size, size_t block_size = 1024);
+ explicit convolve_filter(const univector<T>& data, size_t block_size = 1024);
+ void set_data(const univector<T>& data);
protected:
void process_expression(T* dest, const expression_pointer<T>& src, size_t size) final
@@ -131,49 +85,7 @@ protected:
univector<T> input = truncate(src, size);
process_buffer(dest, input.data(), input.size());
}
- void process_buffer(T* output, const T* input, size_t size) final
- {
- size_t processed = 0;
- while (processed < size)
- {
- const size_t processing = std::min(size - processed, block_size - input_position);
- internal::builtin_memcpy(saved_input.data() + input_position, input + processed,
- processing * sizeof(T));
-
- process(scratch, padded(saved_input));
- fft.execute(segments[position], scratch, temp, dft_pack_format::Perm);
-
- if (input_position == 0)
- {
- process(premul, zeros());
- for (size_t i = 1; i < segments.size(); i++)
- {
- const size_t n = (position + i) % segments.size();
- fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm);
- }
- }
- fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position],
- dft_pack_format::Perm);
-
- fft.execute(scratch, cscratch, temp, dft_pack_format::Perm);
-
- process(make_univector(output + processed, processing),
- scratch.slice(input_position) + overlap.slice(input_position));
-
- input_position += processing;
- if (input_position == block_size)
- {
- input_position = 0;
- process(saved_input, zeros());
-
- internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
-
- position = position > 0 ? position - 1 : segments.size() - 1;
- }
-
- processed += processing;
- }
- }
+ void process_buffer(T* output, const T* input, size_t size) final;
const dft_plan_real<T> fft;
univector<u8> temp;
@@ -189,5 +101,6 @@ protected:
univector<T> overlap;
size_t position;
};
-}
+
+} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/dft/dft-src.cpp b/include/kfr/dft/dft-src.cpp
@@ -24,10 +24,13 @@
See https://www.kfrlib.com for details.
*/
+#include "../base/basic_expressions.hpp"
+#include "../testo/assert.hpp"
#include "bitrev.hpp"
+#include "cache.hpp"
+#include "convolution.hpp"
#include "fft.hpp"
#include "ft.hpp"
-#include "../testo/assert.hpp"
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
@@ -248,7 +251,7 @@ template <typename T, size_t width>
CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format)
{
const size_t count = stage_size / 4;
- size_t nnstep = size / stage_size;
+ size_t nnstep = size / stage_size;
DFT_ASSERT(width <= count);
CMT_LOOP_NOUNROLL
for (size_t n = 0; n < count; n += width)
@@ -259,7 +262,7 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s
}
}
-#ifdef CMT_ARCH_X86
+#if defined CMT_ARCH_SSE
#ifdef CMT_COMPILER_GNU
#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0);
#else
@@ -484,11 +487,11 @@ struct fft_final_stage_impl : dft_stage<T>
}
protected:
- constexpr static size_t width = fft_vector_width<T>;
- constexpr static bool is_even = cometa::is_even(ilog2(size));
- constexpr static bool use_br2 = !is_even;
- constexpr static bool aligned = false;
- constexpr static bool prefetch = splitin;
+ constexpr static size_t width = fft_vector_width<T>;
+ constexpr static bool is_even = cometa::is_even(ilog2(size));
+ constexpr static bool use_br2 = !is_even;
+ constexpr static bool aligned = false;
+ constexpr static bool prefetch = splitin;
KFR_INTRIN void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
KFR_INTRIN void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
@@ -510,7 +513,7 @@ protected:
virtual void do_execute(complex<T>* out, const complex<T>* in, u8* /*temp*/) override
{
- const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
final_stage(csize<size>, 1, cbool<splitin>, out, in, twiddle);
}
@@ -1064,6 +1067,164 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_
template <typename T>
dft_plan<T>::~dft_plan() = default;
+namespace internal
+{
+
+template <typename T>
+univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2)
+{
+ const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
+ univector<complex<T>> src1padded = src1;
+ univector<complex<T>> src2padded = src2;
+ src1padded.resize(size, 0);
+ src2padded.resize(size, 0);
+
+ dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
+ univector<u8> temp(dft->temp_size);
+ dft->execute(src1padded, src1padded, temp);
+ dft->execute(src2padded, src2padded, temp);
+ src1padded = src1padded * src2padded;
+ dft->execute(src1padded, src1padded, temp, true);
+ const T invsize = reciprocal<T>(size);
+ return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+}
+
+template <typename T>
+univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2)
+{
+ const size_t size = next_poweroftwo(src1.size() + src2.size() - 1);
+ univector<complex<T>> src1padded = src1;
+ univector<complex<T>> src2padded = reverse(src2);
+ src1padded.resize(size, 0);
+ src2padded.resize(size, 0);
+ dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
+ univector<u8> temp(dft->temp_size);
+ dft->execute(src1padded, src1padded, temp);
+ dft->execute(src2padded, src2padded, temp);
+ src1padded = src1padded * src2padded;
+ dft->execute(src1padded, src1padded, temp, true);
+ const T invsize = reciprocal<T>(size);
+ return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
+}
+
+template <typename T>
+univector<T> autocorrelate(const univector_ref<const T>& src1)
+{
+ return internal::autocorrelate(src1.slice());
+ univector<T> result = correlate(src1, src1);
+ result = result.slice(result.size() / 2);
+ return result;
+}
+
+template univector<float> convolve<float>(const univector_ref<const float>&,
+ const univector_ref<const float>&);
+template univector<double> convolve<double>(const univector_ref<const double>&,
+ const univector_ref<const double>&);
+template univector<float> correlate<float>(const univector_ref<const float>&,
+ const univector_ref<const float>&);
+template univector<double> correlate<double>(const univector_ref<const double>&,
+ const univector_ref<const double>&);
+
+template univector<float> autocorrelate<float>(const univector_ref<const float>&);
+template univector<double> autocorrelate<double>(const univector_ref<const double>&);
+
+} // namespace internal
+
+template <typename T>
+convolve_filter<T>::convolve_filter(size_t size, size_t block_size)
+ : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size),
+ segments((size + block_size - 1) / block_size)
+{
+}
+
+template <typename T>
+convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size)
+ : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)),
+ temp(fft.temp_size),
+ segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
+ ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
+ input_position(0), position(0)
+{
+ set_data(data);
+}
+
+template <typename T>
+void convolve_filter<T>::set_data(const univector<T>& data)
+{
+ univector<T> input(fft.size);
+ const T ifftsize = reciprocal(T(fft.size));
+ for (size_t i = 0; i < ir_segments.size(); i++)
+ {
+ segments[i].resize(block_size);
+ ir_segments[i].resize(block_size, 0);
+ input = padded(data.slice(i * block_size, block_size));
+
+ fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm);
+ process(ir_segments[i], ir_segments[i] * ifftsize);
+ }
+ saved_input.resize(block_size, 0);
+ scratch.resize(block_size * 2);
+ premul.resize(block_size, 0);
+ cscratch.resize(block_size);
+ overlap.resize(block_size, 0);
+}
+
+template <typename T>
+void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
+{
+ size_t processed = 0;
+ while (processed < size)
+ {
+ const size_t processing = std::min(size - processed, block_size - input_position);
+ internal::builtin_memcpy(saved_input.data() + input_position, input + processed,
+ processing * sizeof(T));
+
+ process(scratch, padded(saved_input));
+ fft.execute(segments[position], scratch, temp, dft_pack_format::Perm);
+
+ if (input_position == 0)
+ {
+ process(premul, zeros());
+ for (size_t i = 1; i < segments.size(); i++)
+ {
+ const size_t n = (position + i) % segments.size();
+ fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm);
+ }
+ }
+ fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], dft_pack_format::Perm);
+
+ fft.execute(scratch, cscratch, temp, dft_pack_format::Perm);
+
+ process(make_univector(output + processed, processing),
+ scratch.slice(input_position) + overlap.slice(input_position));
+
+ input_position += processing;
+ if (input_position == block_size)
+ {
+ input_position = 0;
+ process(saved_input, zeros());
+
+ internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
+
+ position = position > 0 ? position - 1 : segments.size() - 1;
+ }
+
+ processed += processing;
+ }
+}
+
+template convolve_filter<float>::convolve_filter(size_t, size_t);
+template convolve_filter<double>::convolve_filter(size_t, size_t);
+
+template convolve_filter<float>::convolve_filter(const univector<float>&, size_t);
+template convolve_filter<double>::convolve_filter(const univector<double>&, size_t);
+
+template void convolve_filter<float>::set_data(const univector<float>&);
+template void convolve_filter<double>::set_data(const univector<double>&);
+
+template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size);
+template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size);
+
template dft_plan<float>::dft_plan(size_t, cbools_t<false, true>);
template dft_plan<float>::dft_plan(size_t, cbools_t<true, false>);
template dft_plan<float>::dft_plan(size_t, cbools_t<true, true>);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -15,7 +15,7 @@
# along with KFR.
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.0)
set(TEST_SRC
)
@@ -38,16 +38,65 @@ endif ()
find_package(MPFR)
+set(ALL_TESTS_CPP
+ all_tests.cpp
+ base_test.cpp
+ complex_test.cpp
+ dft_test.cpp
+ dsp_test.cpp
+ expression_test.cpp
+ intrinsic_test.cpp)
+
+if (MPFR_FOUND)
+ list(APPEND ALL_TESTS_CPP transcendental_test.cpp)
+endif ()
+
+add_executable(all_tests ${ALL_TESTS_CPP} ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp)
+target_compile_definitions(all_tests PRIVATE KFR_NO_MAIN)
+
add_executable(intrinsic_test intrinsic_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(dft_test dft_test.cpp ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp)
if (MPFR_FOUND)
+ add_definitions(-DHAVE_MPFR)
include_directories(${MPFR_INCLUDE_DIR})
add_executable(transcendental_test transcendental_test.cpp ${KFR_SRC} ${TEST_SRC})
+ target_link_libraries(all_tests ${MPFR_LIBRARIES})
target_link_libraries(transcendental_test ${MPFR_LIBRARIES})
if (WIN32)
target_link_libraries(transcendental_test gmp) # for MSYS
+ target_link_libraries(all_tests gmp) # for MSYS
endif()
endif ()
+
+function(add_x86_test NAME FLAGS)
+ separate_arguments(FLAGS)
+ add_executable(all_tests_${NAME} ${ALL_TESTS_CPP} ${KFR_SRC} ${TEST_SRC} ../include/kfr/dft/dft-src.cpp)
+ target_compile_options(all_tests_${NAME} PRIVATE ${FLAGS})
+ target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN)
+ if(MPFR_FOUND)
+ target_link_libraries(all_tests_${NAME} ${MPFR_LIBRARIES})
+ if (WIN32)
+ target_link_libraries(all_tests_${NAME} gmp) # for MSYS
+ endif()
+ endif()
+endfunction()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR = ${CMAKE_SYSTEM_PROCESSOR}")
+
+set(ARCH_TESTS 0)
+
+if (CLANG AND CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|[aA][mM][dD]64)")
+ set(ARCH_TESTS 1)
+ add_x86_test(generic "-march=x86-64 -mno-avx -DCMT_FORCE_GENERIC_CPU")
+ add_x86_test(sse2 "-march=x86-64 -msse2")
+ add_x86_test(sse3 "-march=x86-64 -msse3 -mno-avx")
+ add_x86_test(ssse3 "-march=x86-64 -mssse3 -mno-avx")
+ add_x86_test(sse41 "-march=x86-64 -msse4.1 -mno-avx")
+ add_x86_test(avx "-march=x86-64 -msse4.1 -mavx")
+ add_x86_test(avx2 "-march=x86-64 -msse4.1 -mavx2 -mfma")
+ add_x86_test(avx512 "-march=x86-64 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl")
+endif()
+
add_executable(dsp_test dsp_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(empty_test empty_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(complex_test complex_test.cpp ${KFR_SRC} ${TEST_SRC})
@@ -56,7 +105,11 @@ add_executable(expression_test expression_test.cpp ${KFR_SRC} ${TEST_SRC})
add_executable(ebu_test ebu_test.cpp ${KFR_SRC} ${TEST_SRC})
-if (ARM)
+if(USE_SDE)
+ find_program(EMULATOR "sde")
+ list(APPEND EMULATOR "-skx")
+ list(APPEND EMULATOR "--")
+elseif (ARM)
find_program(EMULATOR "qemu-arm")
else ()
set(EMULATOR "")
@@ -86,4 +139,15 @@ if (NOT IOS)
endif ()
add_test(NAME dft_test
COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/dft_test)
+
+ if(ARCH_TESTS)
+ add_test(NAME generic COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_generic )
+ add_test(NAME sse2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse2 )
+ add_test(NAME sse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse3 )
+ add_test(NAME ssse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_ssse3 )
+ add_test(NAME sse41 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse41 )
+ add_test(NAME avx COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx )
+ add_test(NAME avx2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx2 )
+ add_test(NAME avx512 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx512 )
+ endif()
endif ()
diff --git a/tests/all_tests.cpp b/tests/all_tests.cpp
@@ -0,0 +1,22 @@
+#include <kfr/io/tostring.hpp>
+#include <kfr/testo/testo.hpp>
+#include <kfr/version.hpp>
+#ifdef HAVE_MPFR
+#include "mpfr/mpfrplus.hpp"
+#endif
+
+using namespace kfr;
+
+int main()
+{
+ println(library_version(), " running on ", cpu_runtime());
+ if (get_cpu() < cpu_t::native)
+ {
+ println("CPU is not supported");
+ return -1;
+ }
+#ifdef HAVE_MPFR
+ mpfr::scoped_precision p(128);
+#endif
+ return testo::run_all("");
+}
diff --git a/tests/base_test.cpp b/tests/base_test.cpp
@@ -377,8 +377,10 @@ TEST(test_stat)
}
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp
@@ -210,9 +210,11 @@ TEST(static_tests)
testo::assert_is_same<common_type<complex<int>, double>, complex<double>>();
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -14,9 +14,9 @@
using namespace kfr;
#ifdef KFR_NATIVE_F64
-constexpr ctypes_t<float, double> float_types{};
+constexpr ctypes_t<float, double> dft_float_types{};
#else
-constexpr ctypes_t<float> float_types{};
+constexpr ctypes_t<float> dft_float_types{};
#endif
TEST(test_convolve)
@@ -62,7 +62,7 @@ TEST(fft_real)
kfr::univector<float_type, size> in = gen_random_range<float_type>(gen, -1.0, +1.0);
kfr::univector<kfr::complex<float_type>, size / 2 + 1> out = realdft(in);
- kfr::univector<float_type, size> rev = irealdft(out) / size;
+ kfr::univector<float_type, size> rev = irealdft(out) / size;
CHECK(rms(rev - in) <= 0.00001f);
}
@@ -71,7 +71,7 @@ TEST(fft_accuracy)
testo::active_test()->show_progress = true;
random_bit_generator gen(2247448713, 915890490, 864203735, 2982561);
- testo::matrix(named("type") = float_types, //
+ testo::matrix(named("type") = dft_float_types, //
named("inverse") = std::make_tuple(false, true), //
named("log2(size)") = make_range(size_t(1), stopsize), //
[&gen](auto type, bool inverse, size_t log2size) {
@@ -115,16 +115,18 @@ TEST(fft_accuracy)
univector<float_type> out2(size, 0.f);
dft.execute(out2, out, temp);
- out2 = out2 / size;
+ out2 = out2 / size;
const float_type rms_diff_r2 = rms(in - out2);
CHECK(rms_diff_r2 < epsilon * ops);
}
});
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version(), " running on ", cpu_runtime());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp
@@ -306,8 +306,10 @@ TEST(fir)
});
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp
@@ -146,9 +146,11 @@ TEST(partition)
}
}
+#ifndef KFR_NO_MAIN
int main()
{
println(library_version());
return testo::run_all("", true);
}
+#endif
diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp
@@ -12,8 +12,8 @@
using namespace kfr;
-constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, //
- i16x1, i16x2, i16x4, i16x8, i16x16, i16x3, //
+constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64, i8x3, //
+ i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x3, //
i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 //
#ifdef KFR_NATIVE_I64
,
@@ -22,8 +22,8 @@ constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x3, //
>
signed_types{};
-constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x3, //
- u16x1, u16x2, u16x4, u16x8, u16x16, u16x3, //
+constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64, u8x3, //
+ u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x3, //
u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 //
#ifdef KFR_NATIVE_I64
,
@@ -127,6 +127,16 @@ inline T ref_satsub(T x, T y)
return result;
}
+TEST(intrin_select)
+{
+ testo::matrix(named("type") = cconcat(signed_types, cconcat(unsigned_types, float_types)), [](auto type) {
+ using Tvec = type_of<decltype(type)>;
+ using T = subtype<Tvec>;
+ CHECK(kfr::select(make_mask<T>(false), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(2));
+ CHECK(kfr::select(make_mask<T>(true), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(1));
+ });
+}
+
TEST(intrin_abs)
{
testo::assert_is_same<decltype(kfr::abs(1)), int>();
@@ -387,4 +397,10 @@ TEST(intrin_math)
CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233)));
}
-int main() { return testo::run_all("", false); }
+#ifndef KFR_NO_MAIN
+int main()
+{
+ println(library_version(), " running on ", cpu_runtime());
+ return testo::run_all("", false);
+}
+#endif
diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp
@@ -14,6 +14,8 @@
using namespace kfr;
+using vector_types = ctypes_t<f32, f64, f32x2, f32x8, f32x16, f64x2, f64x4, f64x8>;
+
template <typename T>
double ulps(T test, const mpfr::number& ref)
{
@@ -25,15 +27,72 @@ double ulps(T test, const mpfr::number& ref)
mpfr::abs(mpfr::number(test) - std::nexttoward(test, HUGE_VALL)));
}
+template <typename T, size_t N>
+double ulps(const vec<T, N>& test, const mpfr::number& ref)
+{
+ double u = 0;
+ for (size_t i = 0; i < N; ++i)
+ u = std::max(u, ulps(test[i], ref));
+ return u;
+}
+
TEST(test_sin_cos)
{
- testo::matrix(named("type") = ctypes_t<f32, f64>(),
+ testo::matrix(named("type") = vector_types(),
named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05),
[](auto type, double value) {
using T = type_of<decltype(type)>;
const T x(value);
- CHECK(ulps(kfr::sin(x), mpfr::sin(x)) < 2.0);
- CHECK(ulps(kfr::cos(x), mpfr::cos(x)) < 2.0);
+ CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0);
+ });
+ testo::matrix(named("type") = vector_types(), named("value") = make_range(-100.0, 100.0, 0.5),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0);
+ });
+}
+
+TEST(test_tan)
+{
+ testo::matrix(named("type") = ctypes_t<f32>(),
+ named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.01),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 2.0);
+ });
+ testo::matrix(named("type") = ctypes_t<f32>(), named("value") = make_range(-100.0, 100.0, 0.5),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 3.0);
+ });
+}
+
+TEST(test_asin_acos_atan)
+{
+ testo::matrix(named("type") = vector_types(), named("value") = make_range(-1.0, 1.0, 0.05),
+ [](auto type, double value) {
+ using T = type_of<decltype(type)>;
+ const T x(value);
+ CHECK(ulps(kfr::asin(x), mpfr::asin(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::acos(x), mpfr::acos(subtype<T>(value))) < 2.0);
+ CHECK(ulps(kfr::atan(x), mpfr::atan(subtype<T>(value))) < 2.0);
+ });
+}
+
+TEST(test_atan2)
+{
+ testo::matrix(named("type") = vector_types(), named("value1") = make_range(-1.0, 1.0, 0.1),
+ named("value2") = make_range(-1.0, 1.0, 0.1), [](auto type, double value1, double value2) {
+ using T = type_of<decltype(type)>;
+ const T x(value1);
+ const T y(value2);
+ CHECK(ulps(kfr::atan2(x, y), mpfr::atan2(subtype<T>(value1), subtype<T>(value2))) <
+ 2.0);
});
}
@@ -97,8 +156,11 @@ TEST(test_exp10)
});
}
+#ifndef KFR_NO_MAIN
int main()
{
+ println(library_version(), " running on ", cpu_runtime());
mpfr::scoped_precision p(128);
return testo::run_all("");
}
+#endif