kfr

Fast, modern C++ DSP framework: FFT, sample rate conversion, FIR/IIR/biquad filters (SSE, AVX, AVX-512, ARM NEON)

commit 940f5d9c26ef2a724a5faf4f421470ca6ef72564
parent 51d4e60bd55b1a68556ac5ca2d443c970769adb1
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Wed, 12 Oct 2022 14:40:12 +0100

Refactoring

Diffstat:
M include/kfr/base.hpp | 2 --
M include/kfr/base/basic_expressions.hpp | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M include/kfr/base/conversion.hpp | 2 +-
M include/kfr/base/expression.hpp | 25 +++++++++++++++++++++++--
M include/kfr/base/generators.hpp | 226 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M include/kfr/base/pointer.hpp | 204 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M include/kfr/base/random.hpp | 80 +------------------------------------------------------------------------------
A include/kfr/base/random_bits.hpp | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/base/reduce.hpp | 2 +-
M include/kfr/base/shape.hpp | 36 +++++++++++++++++++++++++++---------
D include/kfr/base/sort.hpp | 103 -------------------------------------------------------------------------------
M include/kfr/base/tensor.hpp | 6 +++---
M include/kfr/math.hpp | 7 -------
M include/kfr/math/complex_math.hpp | 6 +++---
D include/kfr/math/impl/abs.hpp | 138 -------------------------------------------------------------------------------
M include/kfr/math/impl/asin_acos.hpp | 6 +++---
M include/kfr/math/impl/atan.hpp | 6 +++---
D include/kfr/math/impl/clamp.hpp | 55 -------------------------------------------------------
M include/kfr/math/impl/hyperbolic.hpp | 8 ++++----
M include/kfr/math/impl/log_exp.hpp | 10 +++++-----
D include/kfr/math/impl/logical.hpp | 284 -------------------------------------------------------------------------------
D include/kfr/math/impl/min_max.hpp | 236 -------------------------------------------------------------------------------
D include/kfr/math/impl/round.hpp | 282 -------------------------------------------------------------------------------
D include/kfr/math/impl/saturation.hpp | 205 -------------------------------------------------------------------------------
D include/kfr/math/impl/select.hpp | 331 -------------------------------------------------------------------------------
M include/kfr/math/impl/sin_cos.hpp | 8 ++++----
M include/kfr/math/impl/tan.hpp | 6 +++---
M include/kfr/math/interpolation.hpp | 2 +-
M include/kfr/simd.hpp | 8 ++++++++
R include/kfr/math/abs.hpp -> include/kfr/simd/abs.hpp | 0
R include/kfr/math/clamp.hpp -> include/kfr/simd/clamp.hpp | 0
A include/kfr/simd/impl/abs.hpp | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/clamp.hpp | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/logical.hpp | 284 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/min_max.hpp | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/round.hpp | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/saturation.hpp | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A include/kfr/simd/impl/select.hpp | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R include/kfr/math/logical.hpp -> include/kfr/simd/logical.hpp | 0
R include/kfr/math/min_max.hpp -> include/kfr/simd/min_max.hpp | 0
R include/kfr/math/round.hpp -> include/kfr/simd/round.hpp | 0
R include/kfr/math/saturation.hpp -> include/kfr/simd/saturation.hpp | 0
R include/kfr/math/select.hpp -> include/kfr/simd/select.hpp | 0
A include/kfr/simd/sort.hpp | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M sources.cmake | 40 +++++++++++++++++++++-------------------
R tests/unit/math/abs.cpp -> tests/unit/simd/abs.cpp | 0
R tests/unit/math/min_max.cpp -> tests/unit/simd/min_max.cpp | 0
R tests/unit/math/round.cpp -> tests/unit/simd/round.cpp | 0
R tests/unit/math/select.cpp -> tests/unit/simd/select.cpp | 0
49 files changed, 2199 insertions(+), 1994 deletions(-)

diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp
@@ -29,7 +29,6 @@
 #include "base/expression.hpp"
 #include "base/filter.hpp"
 #include "base/fraction.hpp"
-#include "base/function_expressions.hpp"
 #include "base/generators.hpp"
 #include "base/math_expressions.hpp"
 #include "base/memory.hpp"
@@ -38,5 +37,4 @@
 #include "base/reduce.hpp"
 #include "base/simd_expressions.hpp"
 #include "base/small_buffer.hpp"
-#include "base/sort.hpp"
 #include "base/univector.hpp"
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -93,8 +93,8 @@ struct expression_traits<xcounter<T, Dims>> : expression_traits_defaults
     using value_type = T;
     constexpr static size_t dims = Dims;
-    constexpr static shape<dims> shapeof(const xcounter<T, Dims>& self) { return shape<dims>(max_index_t); }
-    constexpr static shape<dims> shapeof() { return shape<dims>(max_index_t); }
+    constexpr static shape<dims> shapeof(const xcounter<T, Dims>& self) { return shape<dims>(infinite_size); }
+    constexpr static shape<dims> shapeof() { return shape<dims>(infinite_size); }
 };
 
 template <typename T, typename... Args, typename Tout = std::common_type_t<T, Args...>>
@@ -151,9 +151,9 @@ struct expression_traits<xslice<Arg>> : expression_traits_defaults
 
     KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xslice<Arg>& self)
     {
-        return min(ArgTraits::shapeof(self.first()).sub_inf(self.start), self.size);
+        return min(sub_shape(ArgTraits::shapeof(self.first()), self.start), self.size);
     }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(0); }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(undefined_size); }
 };
 
 template <typename Arg, KFR_ACCEPT_EXPRESSIONS(Arg), index_t Dims = expression_dims<Arg>>
@@ -333,8 +333,11 @@ struct expression_traits<xpadded<Arg>> : expression_traits_defaults
     constexpr static size_t dims = ArgTraits::dims;
     constexpr static bool random_access = ArgTraits::random_access;
 
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xpadded<Arg>& self) { return infinite_size; }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return infinite_size; }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xpadded<Arg>& self)
+    {
+        return shape<dims>(infinite_size);
+    }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(infinite_size); }
 };
 
 inline namespace CMT_ARCH_NAME
@@ -516,7 +519,7 @@ struct expression_traits<xreshape<Arg, OutDims>> : expression_traits_defaults
     {
         return self.out_shape;
     }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ 0 }; }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ undefined_size }; }
 };
 
 inline namespace CMT_ARCH_NAME
@@ -647,7 +650,7 @@ struct expression_traits<xlinspace<T, truncated>> : expression_traits_defaults
     {
         return shape<dims>(truncated ? self.size : infinite_size);
     }
-    constexpr static shape<dims> shapeof() { return shape<dims>(truncated ? 0 : infinite_size); }
+    constexpr static shape<dims> shapeof() { return shape<dims>(truncated ? undefined_size : infinite_size); }
 };
 
 template <bool truncated = false, typename T1, typename T2, typename Tout = std::common_type_t<T1, T2>>
@@ -672,4 +675,106 @@ KFR_INTRINSIC vec<T, N> get_elements(const xlinspace<T, truncated>& self, const
 
 // ----------------------------------------------------------------------------
 
+template <typename Arg1, typename Arg2, index_t ConcatAxis>
+struct xconcatenate : public xwitharguments<Arg1, Arg2>
+{
+    static_assert(expression_dims<Arg1> == expression_dims<Arg2>);
+    static_assert(std::is_same_v<expression_value_type<Arg1>, expression_value_type<Arg2>>);
+    constexpr static index_t dims = expression_dims<Arg1>;
+    shape<dims> size1;
+
+    KFR_MEM_INTRINSIC xconcatenate(Arg1&& arg1, Arg2&& arg2)
+        : xwitharguments<Arg1, Arg2>{ std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) },
+          size1(expression_traits<Arg1>::shapeof(arg1))
+    {
+    }
+};
+
+template <typename Arg1, typename Arg2, index_t ConcatAxis>
+struct expression_traits<xconcatenate<Arg1, Arg2, ConcatAxis>> : expression_traits_defaults
+{
+    using ArgTraits1 = expression_traits<Arg1>;
+    using ArgTraits2 = expression_traits<Arg2>;
+
+    using value_type = typename ArgTraits1::value_type;
+    constexpr static size_t dims = ArgTraits1::dims;
+    constexpr static bool random_access = ArgTraits1::random_access && ArgTraits2::random_access;
+
+    KFR_INTRINSIC static shape<dims> concat_shape(const shape<dims>& sh1, const shape<dims>& sh2)
+    {
+        shape<dims> result = min(sh1, sh2);
+        shape<dims> sum = add_shape_undef(sh1, sh2);
+        result[ConcatAxis] = sum[ConcatAxis];
+        return result;
+    }
+
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xconcatenate<Arg1, Arg2, ConcatAxis>& self)
+    {
+        return concat_shape(ArgTraits1::shapeof(std::get<0>(self)), ArgTraits2::shapeof(std::get<1>(self)));
+    }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof()
+    {
+        return concat_shape(ArgTraits1::shapeof(), ArgTraits2::shapeof());
+    }
+};
+
+template <index_t ConcatAxis = 0, typename Arg1, typename Arg2, KFR_ACCEPT_EXPRESSIONS(Arg1, Arg2)>
+KFR_INTRINSIC xconcatenate<Arg1, Arg2, ConcatAxis> concatenate(Arg1&& arg1, Arg2&& arg2)
+{
+    return { std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) };
+}
+
+template <index_t ConcatAxis = 0, typename Arg1, typename Arg2, typename Arg3,
+          KFR_ACCEPT_EXPRESSIONS(Arg1, Arg2, Arg3)>
+KFR_INTRINSIC xconcatenate<Arg1, xconcatenate<Arg2, Arg3, ConcatAxis>, ConcatAxis> concatenate(Arg1&& arg1,
+                                                                                               Arg2&& arg2,
+                                                                                               Arg3&& arg3)
+{
+    return { std::forward<Arg1>(arg1), { std::forward<Arg2>(arg2), std::forward<Arg3>(arg3) } };
+}
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename Arg1, typename Arg2, index_t ConcatAxis, index_t NDims, index_t Axis, size_t N,
+          typename T = typename expression_traits<xconcatenate<Arg1, Arg2, ConcatAxis>>::value_type>
+KFR_INTRINSIC vec<T, N> get_elements(const xconcatenate<Arg1, Arg2, ConcatAxis>& self,
+                                     const shape<NDims>& index, const axis_params<Axis, N>& sh)
+{
+    const index_t size1 = self.size1;
+    constexpr index_t Naxis = ConcatAxis == Axis ? N : 1;
+    if (index[ConcatAxis] >= size1[ConcatAxis])
+    {
+        shape index1 = index;
+        index1[ConcatAxis] -= size1[ConcatAxis];
+        return get_elements(std::get<1>(self.args), index1, sh);
+    }
+    else if (CMT_LIKELY(index[ConcatAxis] + Naxis <= size1[ConcatAxis]))
+    {
+        return get_elements(std::get<0>(self.args), index, sh);
+    }
+    else // (index < size1) && (index + N > size1)
+    {
+        vec<T, N> result;
+        // Here Axis == ConcatAxis
+        shape index1 = index;
+        for (index_t i = 0; i < size1[ConcatAxis] - index[ConcatAxis]; ++i)
+        {
+            result[i] = get_elements(std::get<0>(self.args), index1, axis_params<Axis, 1>{})[0];
+            ++index1[ConcatAxis];
+        }
+        index1[ConcatAxis] -= size1[ConcatAxis];
+        for (index_t i = size1[ConcatAxis] - index[ConcatAxis]; i < N; ++i)
+        {
+            result[i] = get_elements(std::get<1>(self.args), index1, axis_params<Axis, 1>{})[0];
+            ++index1[ConcatAxis];
+        }
+        return result;
+    }
+}
+
+} // namespace CMT_ARCH_NAME
+
+// ----------------------------------------------------------------------------
+
 } // namespace kfr
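Usage sketch for the new concatenate() expression (not part of this commit; assumes KFR's univector container and the truncate() helper from the rest of the library):

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    const univector<float, 3> a{ 1, 2, 3 };
    const univector<float, 2> b{ 4, 5 };
    // join along axis 0: concat_shape gives 3 + 2 = 5 elements
    const univector<float, 5> c = truncate(concatenate(a, b), 5);
    // c == { 1, 2, 3, 4, 5 }; reads that straddle the seam are stitched element-by-element
    return c[3] == 4 ? 0 : 1;
}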
diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp
@@ -25,7 +25,7 @@
 */
 #pragma once
 
-#include "../math/clamp.hpp"
+#include "../simd/clamp.hpp"
 #include "../simd/types.hpp"
 #include "../simd/vec.hpp"
 #include "univector.hpp"
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -131,7 +131,7 @@ using enable_if_output_expression =
     vec<typename expression_traits<T>::value_type, 1>{}))>;
 
 template <typename T>
-using enable_if_inout_output_expression =
+using enable_if_input_output_expression =
     std::void_t<enable_if_input_expression<T>, enable_if_output_expression<T>>;
 
 template <typename... T>
@@ -140,6 +140,27 @@ using enable_if_input_expressions = std::void_t<enable_if_input_expression<T>...
 template <typename... T>
 using enable_if_output_expressions = std::void_t<enable_if_output_expression<T>...>;
 
+template <typename... T>
+using enable_if_input_output_expressions = std::void_t<enable_if_input_output_expression<T>...>;
+
+template <typename E, typename = void>
+constexpr inline bool is_input_expression = false;
+
+template <typename E>
+constexpr inline bool is_input_expression<E, enable_if_input_expression<E>> = true;
+
+template <typename E, typename = void>
+constexpr inline bool is_output_expression = false;
+
+template <typename E>
+constexpr inline bool is_output_expression<E, enable_if_output_expression<E>> = true;
+
+template <typename E, typename = void>
+constexpr inline bool is_input_output_expression = false;
+
+template <typename E>
+constexpr inline bool is_input_output_expression<E, enable_if_input_output_expression<E>> = true;
+
 #define KFR_ACCEPT_EXPRESSIONS(...) internal_generic::expressions_check<__VA_ARGS__>* = nullptr
 
 template <typename T>
@@ -311,7 +332,7 @@ static auto process(Out&& out, In&& in, shape<outdims> start = shape<outdims>(0)
     const shape<indims> inshape = Trin::shapeof(in);
     if (CMT_UNLIKELY(!internal_generic::can_assign_from(outshape, inshape)))
         return shape<outdims>{ 0 };
-    shape<outdims> stop = min(start.add_inf(size), outshape);
+    shape<outdims> stop = min(add_shape(start, size), outshape);
 
     index_t in_size = 0;
     if constexpr (indims > 0)
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -26,107 +26,108 @@
 #pragma once
 
 #include "../math/log_exp.hpp"
-#include "../math/select.hpp"
 #include "../math/sin_cos.hpp"
 #include "../simd/impl/function.hpp"
+#include "../simd/select.hpp"
 #include "../simd/vec.hpp"
+#include "shape.hpp"
 
 namespace kfr
 {
-inline namespace CMT_ARCH_NAME
-{
-
-namespace internal
-{
-template <typename T, size_t width_, typename Class>
-struct generator : input_expression
+template <typename T, size_t VecWidth, typename Class, typename Twork = T>
+struct xgenerator
 {
-    constexpr static size_t width = width_;
-    using value_type = T;
-
-    constexpr static bool is_incremental = true;
-
-    template <typename U, size_t N>
-    friend KFR_INTRINSIC vec<U, N> get_elements(const generator& self, cinput_t, size_t index,
-                                                vec_shape<U, N> t)
-    {
-        return self.generate(t);
-    }
+    constexpr static size_t width = VecWidth;
 
     void resync(T start) const { ptr_cast<Class>(this)->sync(start); }
 
-protected:
-    void call_next() const { ptr_cast<Class>(this)->next(); }
     template <size_t N>
-    void call_shift(csize_t<N>) const
+    KFR_MEM_INTRINSIC vec<T, N> generate() const
     {
-        ptr_cast<Class>(this)->shift(csize_t<N>());
+        if constexpr (N < width)
+        {
+            const vec<T, N> result = narrow<N>(call_get_value());
+            const vec<Twork, width> oldvalue = value;
+            call_next();
+            value = slice<N, width>(oldvalue, value);
+            return result;
+        }
+        else if (N > width)
+        {
+            const vec lo = generate(low(x));
+            const vec hi = generate(high(x));
+            return concat(lo, hi);
+        }
+        else // N == width
+        {
+            const vec<T, N> result = call_get_value();
+            call_next();
+            return result;
+        }
     }
+    mutable vec<Twork, width> value;
 
-    template <size_t N>
-    void shift(csize_t<N>) const
-    {
-        const vec<T, width> oldvalue = value;
-        call_next();
-        value = slice<N, width>(oldvalue, value);
-    }
+private:
+    KFR_MEM_INTRINSIC void call_next() const { ptr_cast<Class>(this)->next(); }
 
-    template <size_t N, KFR_ENABLE_IF(N == width)>
-    KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
-    {
-        const vec<T, N> result = value;
-        call_next();
-        return result;
-    }
+    KFR_MEM_INTRINSIC vec<T, width> call_get_value() const { return ptr_cast<Class>(this)->get_value(); }
 
-    template <size_t N, KFR_ENABLE_IF(N < width)>
-    KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
+    KFR_MEM_INTRINSIC std::enable_if_t<std::is_same_v<T, Twork>, vec<T, width>> get_value() const
     {
-        const vec<T, N> result = narrow<N>(value);
-        call_shift(csize_t<N>());
-        return result;
+        return value;
     }
+};
 
-    template <size_t N, KFR_ENABLE_IF(N > width)>
-    KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N> x) const
-    {
-        const auto lo = generate(low(x));
-        const auto hi = generate(high(x));
-        return concat(lo, hi);
-    }
+template <typename T, size_t VecWidth, typename Class>
+struct expression_traits<xgenerator<T, VecWidth, Class>> : public expression_traits_defaults
+{
+    using value_type = T;
+    constexpr static size_t dims = 1;
+    constexpr static shape<1> shapeof(const T&) { return shape<1>(infinite_size); }
+    constexpr static shape<1> shapeof() { return shape<1>(infinite_size); }
 
-    mutable vec<T, width> value;
+    constexpr static inline bool explicit_operand = true;
+    constexpr static inline bool random_access = false;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_linear : generator<T, width, generator_linear<T, width>>
+inline namespace CMT_ARCH_NAME
+{
+template <typename T, size_t VecWidth, typename Class, size_t N>
+KFR_INTRINSIC vec<T, N> get_elements(const xgenerator<T, VecWidth, Class>& self, const shape<1>& index,
+                                     const axis_params<0, N>&)
+{
+    return self.template generate<N>();
+}
+} // namespace CMT_ARCH_NAME
+
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenlinear : public xgenerator<T, VecWidth, xgenlinear<T, VecWidth>>
 {
-    generator_linear(T start, T step) CMT_NOEXCEPT : step(step), vstep(step* width) { this->resync(start); }
+    xgenlinear(T start, T step) CMT_NOEXCEPT : step{ step }, vstep{ step * VecWidth } { sync(start); }
 
     KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
     {
-        this->value = start + enumerate<T, width>() * step;
+        this->value = start + enumerate(vec_shape<T, VecWidth>{}, vstep / VecWidth);
     }
 
     KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += vstep; }
 
-protected:
-    T step;
     T vstep;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_exp : generator<T, width, generator_exp<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenexp : public xgenerator<T, VecWidth, xgenexp<T, VecWidth>>
 {
-    generator_exp(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp(make_vector(step* width))[0] - 1)
+    xgenexp(T start, T step) CMT_NOEXCEPT : step{ step },
+                                            vstep{ exp(make_vector(step * VecWidth)).front() - 1 }
     {
        this->resync(start);
    }
 
    KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
    {
-        this->value = exp(start + enumerate<T, width>() * step);
+        this->value = exp(start + enumerate<T, VecWidth>() * step);
    }
 
    KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
 
@@ -136,14 +137,14 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2),
-          std::enable_if_t<std::is_same<complex<deep_subtype<T>>, T>::value, int> = 0>
-struct generator_expj : generator<T, width, generator_expj<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<deep_subtype<T>> / 8 / 2>
+struct xgenexpj : public xgenerator<T, VecWidth, xgenexpj<T, VecWidth>>
 {
     using ST = deep_subtype<T>;
+    static_assert(std::is_same_v<complex<deep_subtype<T>>, T>, "xgenexpj requires complex type");
 
-    generator_expj(ST start_, ST step_)
-        : step(step_), alpha(2 * sqr(sin(width * step / 2))), beta(-sin(width * step))
+    xgenexpj(ST start_, ST step_)
+        : step(step_), alpha(2 * sqr(sin(VecWidth * step / 2))), beta(-sin(VecWidth * step))
     {
         this->resync(T(start_));
     }
@@ -156,26 +157,26 @@ struct generator_expj : generator<T, width, generator_expj<T, width>>
     }
 
 protected:
-    ST const step;
-    ST const alpha;
-    ST const beta;
-    CMT_NOINLINE static vec<T, width> init_cossin(ST w, ST phase)
+    ST step;
+    ST alpha;
+    ST beta;
+    CMT_NOINLINE static vec<T, VecWidth> init_cossin(ST w, ST phase)
     {
         return ccomp(cossin(dup(phase + enumerate<ST, width>() * w)));
     }
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenexp2 : public xgenerator<T, VecWidth, xgenexp2<T, VecWidth>>
 {
-    generator_exp2(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp2(make_vector(step* width))[0] - 1)
+    xgenexp2(T start, T step) CMT_NOEXCEPT : step{ step }, vstep{ exp2(make_vector(step * VecWidth))[0] - 1 }
     {
         this->resync(start);
     }
 
     KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
     {
-        this->value = exp2(start + enumerate<T, width>() * step);
+        this->value = exp2(start + enumerate(vec_shape<T, VecWidth>, step));
     }
 
     KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
 
@@ -185,11 +186,12 @@ protected:
     T vstep;
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_cossin : generator<T, width, generator_cossin<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgencossin : public xgenerator<T, VecWidth, xgencossin<T, VecWidth>>
 {
-    generator_cossin(T start, T step)
-        : step(step), alpha(2 * sqr(sin(width / 2 * step / 2))), beta(-sin(width / 2 * step))
+    static_assert(VecWidth % 2 == 0);
+    xgencossin(T start, T step)
+        : step(step), alpha(2 * sqr(sin(VecWidth / 2 * step / 2))), beta(-sin(VecWidth / 2 * step))
     {
         this->resync(start);
     }
@@ -204,56 +206,50 @@ protected:
     T step;
     T alpha;
     T beta;
-    CMT_NOINLINE static vec<T, width> init_cossin(T w, T phase)
+    CMT_NOINLINE static vec<T, VecWidth> init_cossin(T w, T phase)
     {
-        return cossin(dup(phase + enumerate<T, width / 2>() * w));
+        return cossin(dup(phase + enumerate(vec_shape<T, VecWidth / 2>, w)));
     }
 };
 
-template <typename T, size_t width = vector_width<T>* bitness_const(2, 4)>
-struct generator_sin : generator<T, width, generator_sin<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8 / 2>
+struct xgensin : public xgenerator<T, VecWidth, xgensin<T, VecWidth>, vec<T, 2>>
 {
-    generator_sin(T start, T step)
-        : step(step), alpha(2 * sqr(sin(width * step / 2))), beta(sin(width * step))
+    xgensin(T start, T step)
+        : step(step), alpha(2 * sqr(sin(VecWidth * step / 2))), beta(sin(VecWidth * step))
     {
         this->resync(start);
     }
 
     KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
     {
-        const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step)));
-        this->cos_value = low(cs);
-        this->value = high(cs);
+        const vec<T, 2 * VecWidth> cs = cossin(dup(start + enumerate(vec_shape<T, VecWidth>, step)));
+        this->value = vec<vec<T, 2>, VecWidth>::from_flatten(cs);
     }
 
     KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT
     {
-        const vec<T, width> c = this->cos_value;
-        const vec<T, width> s = this->value;
+        const vec<T, 2 * VecWidth> cs = flatten(this->value);
 
-        const vec<T, width> cc = alpha * c + beta * s;
-        const vec<T, width> ss = alpha * s - beta * c;
+        cs = cs - addsub(alpha * cs, beta * swap<2>(cs));
 
-        this->cos_value = c - cc;
-        this->value = s - ss;
-    }
+        this->value = vec<vec<T, 2>, VecWidth>::from_flatten(cs);
 
-    template <size_t N>
-    void shift(csize_t<N>) const CMT_NOEXCEPT
-    {
-        const vec<T, width> oldvalue = this->value;
-        const vec<T, width> oldcosvalue = this->cos_value;
-        next();
-        this->value = slice<N, width>(oldvalue, this->value);
-        this->cos_value = slice<N, width>(oldcosvalue, this->cos_value);
+        // const vec<T, VecWidth> c = even(flatten(this->value));
+        // const vec<T, VecWidth> s = odd(flatten(this->value));
+        // const vec<T, VecWidth> cc = alpha * c + beta * s;
+        // const vec<T, VecWidth> ss = alpha * s - beta * c;
+        // this->cos_value = c - cc;
+        // this->value = s - ss;
     }
 
 protected:
     T step;
     T alpha;
     T beta;
-    mutable vec<T, width> cos_value;
 };
-} // namespace internal
+
+inline namespace CMT_ARCH_NAME
+{
 
 /**
 * @brief Returns template expression that generates values starting from the start and using the step as the
@@ -264,9 +260,9 @@ protected:
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
+KFR_FUNCTION xgenlinear<TF> gen_linear(T1 start, T2 step)
 {
-    return internal::generator_linear<TF>(start, step);
+    return xgenlinear<TF>(start, step);
 }
 
 /**
@@ -276,9 +272,9 @@ KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
+KFR_FUNCTION xgenexp<TF> gen_exp(T1 start, T2 step)
 {
-    return internal::generator_exp<TF>(start, step);
+    return xgenexp<TF>(start, step);
 }
 
 /**
@@ -288,9 +284,9 @@ KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = complex<ftype<common_type<T1, T2>>>>
-KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
+KFR_FUNCTION xgenexpj<TF> gen_expj(T1 start, T2 step)
 {
-    return internal::generator_expj<TF>(start, step);
+    return xgenexpj<TF>(start, step);
 }
 
 /**
@@ -300,9 +296,9 @@ KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
+KFR_FUNCTION xgenexp2<TF> gen_exp2(T1 start, T2 step)
 {
-    return internal::generator_exp2<TF>(start, step);
+    return xgenexp2<TF>(start, step);
 }
 
 /**
@@ -316,9 +312,9 @@ KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
+KFR_FUNCTION xgencossin<TF> gen_cossin(T1 start, T2 step)
 {
-    return internal::generator_cossin<TF>(start, step);
+    return xgencossin<TF>(start, step);
 }
 
 /**
@@ -328,9 +324,9 @@ KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
 \f]
 */
 template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_sin<TF> gen_sin(T1 start, T2 step)
+KFR_FUNCTION xgensin<TF> gen_sin(T1 start, T2 step)
 {
-    return internal::generator_sin<TF>(start, step);
+    return xgensin<TF>(start, step);
 }
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
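Usage sketch for the renamed generators (not part of this commit; the public gen_linear/gen_sin names are unchanged, truncate() is assumed):

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    // linear ramp 0, 0.5, 1.0, ... produced by adding vstep once per SIMD block
    univector<double, 8> ramp = truncate(gen_linear(0.0, 0.5), 8);
    // sin(0), sin(0.1), ... via the alpha/beta rotation recurrence, no per-sample sin() call
    univector<double, 8> sine = truncate(gen_sin(0.0, 0.1), 8);
    return 0;
}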
diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp
@@ -38,6 +38,15 @@ struct expression_pointer;
 template <typename T>
 constexpr size_t maximum_expression_width = vector_width_for<T, cpu_t::highest> * 2;
 
+template <typename T, template <T...> typename Tpl, typename Pack>
+struct expand_cvals;
+
+template <typename T, template <T...> typename Tpl, T... vals>
+struct expand_cvals<T, Tpl, cvals_t<T, vals...>>
+{
+    using type = Tpl<vals...>;
+};
+
 inline namespace CMT_ARCH_NAME
 {
 
@@ -50,64 +59,57 @@ KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& n
 }
 } // namespace CMT_ARCH_NAME
 
-template <typename T, size_t N = maximum_expression_width<T>>
-struct expression_vtable : expression_vtable<T, N / 2>
+template <typename T, index_t Dims>
+struct expression_vtable
 {
-    using func_get = void (*)(void*, size_t, vec<T, N>&);
-    func_get get;
+    constexpr static const size_t Nsizes = 1 + ilog2(maximum_expression_width<T>);
 
-    template <typename Expression>
-    expression_vtable(ctype_t<Expression> t)
-        : expression_vtable<T, N / 2>(t), get(&expression_vtable<T, N>::template static_get<Expression>)
-    {
-    }
+    using func_get = void (*)(void*, shape<Dims>, T*);
+    using func_set = void (*)(void*, shape<Dims>, const T*);
+    using func_shapeof = void (*)(void*, shape<Dims>&);
+    using func_substitute = bool (*)(void*, expression_pointer<T>&&);
 
-    template <typename Expression>
-    static void static_get(void* instance, size_t index, vec<T, N>& result)
-    {
-        result = get_elements(*static_cast<Expression*>(instance), cinput, index, vec_shape<T, N>());
-    }
-};
-
-template <typename T>
-struct expression_vtable<T, 0>
-{
-    using func_size = size_t (*)(void* p);
-    using func_begin_block = void (*)(void*, size_t);
-    using func_end_block = void (*)(void*, size_t);
-    using func_substitute = bool (*)(void*, expression_pointer<T>&&);
-
-    func_size size;
-    func_begin_block begin_block;
-    func_end_block end_block;
-    func_substitute substitute;
+    func_shapeof fn_shapeof;
+    func_substitute fn_substitute;
+    std::array<std::array<func_get, Nsizes>, Dims> fn_get_elements;
+    std::array<std::array<func_set, Nsizes>, Dims> fn_set_elements;
 
     template <typename Expression>
-    expression_vtable(ctype_t<Expression>)
-        : size(&expression_vtable<T, 0>::template static_size<Expression>),
-          begin_block(&expression_vtable<T, 0>::template static_begin_block<Expression>),
-          end_block(&expression_vtable<T, 0>::template static_end_block<Expression>),
-          substitute(&expression_vtable<T, 0>::template static_substitute<Expression>)
+    expression_vtable(ctype_t<Expression> t)
     {
+        fn_shapeof = &static_shapeof<Expression>;
+        fn_substitute = &static_substitute<Expression>;
+        cforeach(csizeseq<Nsizes>,
+                 [&](size_t size) CMT_INLINE_LAMBDA
+                 {
+                     cforeach(csizeseq<Dims>,
+                              [&](size_t axis) CMT_INLINE_LAMBDA
+                              {
+                                  fn_get_elements[axis][size] =
+                                      &expression_vtable::static_get_elements<Expression, 1 << size, axis>;
+                                  fn_set_elements[axis][size] =
+                                      &expression_vtable::static_set_elements<Expression, 1 << size, axis>;
+                              });
+                 });
     }
 
-    template <typename Expression>
-    static size_t static_size(void* instance)
+    template <typename Expression, size_t N, index_t VecAxis>
+    static void static_get_elements(void* instance, shape<Dims> index, T* dest)
     {
-        return static_cast<Expression*>(instance)->size();
+        write(dest, get_elements(*static_cast<Expression*>(instance), index, axis_params_v<VecAxis, N>));
     }
 
-    template <typename Expression>
-    static void static_begin_block(void* instance, size_t size)
+    template <typename Expression, size_t N, index_t VecAxis>
+    static void static_set_elements(void* instance, shape<Dims> index, const T* src)
     {
-        return static_cast<Expression*>(instance)->begin_block(cinput, size);
+        set_elements(*static_cast<Expression*>(instance), index, axis_params_v<VecAxis, N>, read<N>(src));
     }
 
     template <typename Expression>
-    static void static_end_block(void* instance, size_t size)
+    static void static_shapeof(void* instance, shape<Dims>& result)
     {
-        return static_cast<Expression*>(instance)->end_block(cinput, size);
+        result = expression_traits<Expression>::shapeof(*static_cast<Expression*>(instance));
     }
 
     template <typename Expression>
-    static bool static_substitute(void* instance, expression_pointer<T>&& ptr)
+    static bool static_substitute(void* instance, expression_pointer<T> ptr)
     {
         return internal::invoke_substitute(*static_cast<Expression*>(instance), std::move(ptr));
     }
@@ -143,51 +145,77 @@ KFR_INTRINSIC std::shared_ptr<expression_resource> make_resource(E&& e)
         new (aligned_allocate<T>()) T(std::move(e)), [](T* pi) { aligned_deallocate<T>(pi); }));
 }
 
-template <typename T, bool enable_resource>
-struct expression_pointer : input_expression
+template <typename T, index_t Dims>
+struct xpointer
 {
-    using value_type = T;
+    void* instance;
+    const expression_vtable<T, Dims>* vtable;
+    std::shared_ptr<expression_resource> resource;
 
-    expression_pointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
-    expression_pointer(const void* instance, const expression_vtable<T>* vtable,
-                       std::shared_ptr<expression_resource> resource = nullptr)
+    xpointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
+    xpointer(const void* instance, const expression_vtable<T, Dims>* vtable,
+             std::shared_ptr<expression_resource> resource = nullptr)
         : instance(const_cast<void*>(instance)), vtable(vtable), resource(std::move(resource))
     {
     }
-    template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width<T>)>
-    friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t, size_t index,
-                                                vec_shape<T, N>)
+
+    explicit operator bool() const { return instance != nullptr; }
+};
+
+template <typename T, index_t Dims>
+struct expression_traits<xpointer<T, Dims>> : expression_traits_defaults
+{
+    using value_type = T;
+    constexpr static size_t dims = Dims;
+    constexpr static shape<dims> shapeof(const xpointer<T, Dims>& self)
     {
-        static_assert(is_poweroftwo(N), "N must be a power of two");
-        vec<T, N> result;
-        static_cast<const expression_vtable<T, N>*>(self.vtable)->get(self.instance, index, result);
-        return result;
+        shape<dims> result;
+        self.vtable->fn_shapeof(self.instance, result);
     }
-    template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width<T>)>
-    friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t cinput, size_t index,
-                                                vec_shape<T, N>)
+    constexpr static shape<dims> shapeof() { return shape<dims>(undefined_size); }
+
+    constexpr static inline bool random_access = false;
+};
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T, index_t NDims, index_t Axis, size_t N>
+KFR_INTRINSIC vec<T, N> get_elements(const xpointer<T, NDims>& self, const shape<NDims>& index,
+                                     const axis_params<Axis, N>& sh)
+{
+    static_assert(is_poweroftwo(N) && N >= 1);
+    if constexpr (N > expression_vtable<T, NDims>::Nmax)
     {
-        static_assert(is_poweroftwo(N), "N must be a power of two");
-        const vec<T, N / 2> r1 = get_elements(self, cinput, index, vec_shape<T, N / 2>());
-        const vec<T, N / 2> r2 = get_elements(self, cinput, index + N / 2, vec_shape<T, N / 2>());
-        return concat(r1, r2);
+        constexpr size_t Nhalf = N / 2;
+        return concat(get_elements(self, index, axis_params_v<Axis, Nhalf>),
+                      get_elements(self, index.add_at(Axis, Nhalf), axis_params_v<Axis, Nhalf>));
     }
-    KFR_MEM_INTRINSIC void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); }
-    KFR_MEM_INTRINSIC void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); }
-    KFR_MEM_INTRINSIC size_t size() const { return vtable->size(instance); }
-
-    KFR_MEM_INTRINSIC bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const
+    else
     {
-        return vtable->substitute(instance, std::move(new_pointer));
+        portable_vec<T, N> result;
+        self.vtable->fn_get_elements[Axis][ilog2(N)](self.instance, index, result.elem);
+        return result;
     }
+}
 
-    explicit operator bool() const { return instance != nullptr; }
-
-private:
-    void* instance;
-    const expression_vtable<T>* vtable;
-    std::shared_ptr<expression_resource> resource;
-};
+template <typename T, index_t NDims, index_t Axis, size_t N>
+KFR_INTRINSIC void set_elements(const xpointer<T, NDims>& self, const shape<NDims>& index,
+                                const axis_params<Axis, N>& sh, const identity<vec<T, N>>& value)
+{
+    static_assert(is_poweroftwo(N) && N >= 1);
+    if constexpr (N > expression_vtable<T, NDims>::Nmax)
+    {
+        constexpr size_t Nhalf = N / 2;
+        set_elements(self, index, axis_params_v<Axis, Nhalf>, slice<0, Nhalf>(value));
+        set_elements(self, index.add_at(Axis, Nhalf), axis_params_v<Axis, Nhalf>, slice<Nhalf, Nhalf>(value));
    }
+    else
+    {
+        self.vtable->fn_set_elements[Axis][ilog2(N)](self.instance, index, &value.front());
+    }
+}
+} // namespace CMT_ARCH_NAME
 
 inline namespace CMT_ARCH_NAME
 {
@@ -195,11 +223,10 @@ inline namespace CMT_ARCH_NAME
 namespace internal
 {
 
-template <typename T, typename E>
-KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
+template <typename T, index_t Dims, typename E>
+KFR_INTRINSIC expression_vtable<T, Dims>* make_expression_vtable()
 {
-    static_assert(is_input_expression<E>, "E must be an expression");
-    static expression_vtable<T> vtable{ ctype_t<decay<E>>{} };
+    static expression_vtable<T, Dims> vtable{ ctype_t<decay<E>>{} };
     return &vtable;
 }
 } // namespace internal
@@ -208,26 +235,25 @@ KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
 * This overload takes reference to the expression.
 * @warning Use with caution with local variables.
 */
-template <typename E, typename T = value_type_of<E>>
-KFR_INTRINSIC expression_pointer<T> to_pointer(E& expr)
+template <typename E, typename T = expression_value_type<E>, index_t Dims = expression_dims<E>>
+KFR_INTRINSIC expression_pointer<T, Dims> to_pointer(E& expr)
 {
-    static_assert(is_input_expression<E>, "E must be an expression");
-    return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, E>());
+    return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, Dims, E>());
 }
 
 /** @brief Converts the given expression into an opaque object.
 * This overload takes ownership of the expression (Move semantics).
 * @note Use std::move to force use of this overload.
 */
-template <typename E, typename T = value_type_of<E>>
-KFR_INTRINSIC expression_pointer<T> to_pointer(E&& expr)
+template <typename E, typename T = expression_value_type<E>, index_t Dims = expression_dims<E>>
+KFR_INTRINSIC expression_pointer<T, Dims> to_pointer(E&& expr)
 {
-    static_assert(is_input_expression<E>, "E must be an expression");
     std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr));
     void* instance = ptr->instance();
-    return expression_pointer<T>(instance, internal::make_expression_vtable<T, E>(), std::move(ptr));
+    return expression_pointer<T, Dims>(instance, internal::make_expression_vtable<T, E>(), std::move(ptr));
 }
 
+#if 0
 template <typename T, size_t key>
 class expression_placeholder : public input_expression
 {
@@ -306,5 +332,7 @@ KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& n
 }
 } // namespace internal
+#endif
 
 } // namespace CMT_ARCH_NAME
-} // namespace kfr
+
+}
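Usage sketch for the reworked type-erasure API (not part of this commit; assumes expression_pointer is the public name for xpointer here and that truncate() accepts the erased expression):

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    // erase the concrete expression type; only the value type and dims stay visible
    auto expr = to_pointer(gen_linear(0.0f, 1.0f));
    // reads go through the vtable's fn_get_elements[axis][log2(width)] table
    univector<float, 4> out = truncate(expr, 4); // { 0, 1, 2, 3 }
    return 0;
}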
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -25,92 +25,14 @@
 */
 #pragma once
 
-#include "../simd/impl/function.hpp"
-#include "../simd/operators.hpp"
-#include "../simd/shuffle.hpp"
-#include "../simd/vec.hpp"
-#include "expression.hpp"
-#include <functional>
-
-#ifdef CMT_ARCH_ARM
-#define KFR_DISABLE_READCYCLECOUNTER
-#endif
+#include "random_bits.hpp"
 
 namespace kfr
 {
 
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-struct seed_from_rdtsc_t
-{
-};
-
-constexpr seed_from_rdtsc_t seed_from_rdtsc{};
-#endif
-
 inline namespace CMT_ARCH_NAME
 {
 
-using random_state = u32x4;
-
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-#ifdef CMT_COMPILER_CLANG
-#define KFR_builtin_readcyclecounter() \
-    static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
-#else
-#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
-#endif
-#endif
-
-struct random_bit_generator
-{
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-    KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
-        : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
-                                         (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
-    {
-        (void)operator()();
-    }
-#endif
-    KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
-        : state(x0, x1, x2, x3)
-    {
-        (void)operator()();
-    }
-    KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
-        : state(bitcast<u32>(make_vector(x0, x1)))
-    {
-        (void)operator()();
-    }
-
-    KFR_MEM_INTRINSIC random_state operator()()
-    {
-        const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
-        const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
-        state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
-        return state;
-    }
-
-protected:
-    random_state state;
-};
-
-static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
-
-template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
-KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
-{
-    return narrow<N>(bitcast<u8>(gen()));
-}
-
-template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
-KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
-{
-    constexpr size_t N2 = prev_poweroftwo(N - 1);
-    const vec<u8, N2> bits1 = random_bits<N2>(gen);
-    const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
-    return concat(bits1, bits2);
-}
-
 template <typename T, size_t N, KFR_ENABLE_IF(is_integral<T>)>
 KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen)
 {
diff --git a/include/kfr/base/random_bits.hpp b/include/kfr/base/random_bits.hpp
@@ -0,0 +1,114 @@
+/** @addtogroup random
+ *  @{
+ */
+/*
+  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+  This file is part of KFR
+
+  KFR is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 2 of the License, or
+  (at your option) any later version.
+
+  KFR is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with KFR.
+
+  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+  Buying a commercial license is mandatory as soon as you develop commercial activities without
+  disclosing the source code of your own applications.
+  See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../simd/impl/function.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/shuffle.hpp"
+#include "../simd/vec.hpp"
+#include "expression.hpp"
+#include <functional>
+
+#ifdef CMT_ARCH_ARM
+#define KFR_DISABLE_READCYCLECOUNTER
+#endif
+
+namespace kfr
+{
+
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+struct seed_from_rdtsc_t
+{
+};
+
+constexpr seed_from_rdtsc_t seed_from_rdtsc{};
+#endif
+
+inline namespace CMT_ARCH_NAME
+{
+
+using random_state = u32x4;
+
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+#ifdef CMT_COMPILER_CLANG
+#define KFR_builtin_readcyclecounter() \
+    static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
+#else
+#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
+#endif
+#endif
+
+struct random_bit_generator
+{
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+    KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
+        : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
+                                         (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
+    {
+        (void)operator()();
+    }
+#endif
+    KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
+        : state(x0, x1, x2, x3)
+    {
+        (void)operator()();
+    }
+    KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
+        : state(bitcast<u32>(make_vector(x0, x1)))
+    {
+        (void)operator()();
+    }
+
+    KFR_MEM_INTRINSIC random_state operator()()
+    {
+        const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
+        const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
+        state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
+        return state;
+    }
+
+protected:
+    random_state state;
+};
+
+static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
+
+template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
+{
+    return narrow<N>(bitcast<u8>(gen()));
+}
+
+template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
+{
+    constexpr size_t N2 = prev_poweroftwo(N - 1);
+    const vec<u8, N2> bits1 = random_bits<N2>(gen);
+    const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
+    return concat(bits1, bits2);
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
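Usage sketch for the relocated RNG (not part of this commit); the same SIMD LCG is now reachable through random_bits.hpp:

#include <kfr/base.hpp>

using namespace kfr;

int main()
{
    // seed the four u32 lanes explicitly (deterministic across runs)
    random_bit_generator gen{ 1u, 2u, 3u, 4u };
    // 16 raw bytes from one state update; wider requests concatenate updates
    vec<u8, 16> bytes = random_bits<16>(gen);
    (void)bytes;
    return 0;
}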
"../simd/impl/function.hpp" #include "../simd/operators.hpp" diff --git a/include/kfr/base/shape.hpp b/include/kfr/base/shape.hpp @@ -27,7 +27,7 @@ #include "impl/static_array.hpp" -#include "../math/min_max.hpp" +#include "../simd/min_max.hpp" #include "../simd/shuffle.hpp" #include "../simd/types.hpp" #include "../simd/vec.hpp" @@ -60,6 +60,8 @@ constexpr inline cindex_t<val> cindex{}; constexpr inline index_t infinite_size = max_index_t; +constexpr inline index_t undefined_size = 0; + constexpr inline index_t maximum_dims = 8; using dimset = vec<i8, maximum_dims>; @@ -142,20 +144,36 @@ struct shape : static_array_base<index_t, csizeseq_t<dims>> return false; } - shape add_inf(const shape& other) const + friend shape add_shape(const shape& lhs, const shape& rhs) { - vec<index_t, dims> x = **this; - vec<index_t, dims> y = *other; - mask<index_t, dims> inf = (x == infinite_size) || (y == infinite_size); + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; return select(inf, infinite_size, x + y); } - shape sub_inf(const shape& other) const + friend shape sub_shape(const shape& lhs, const shape& rhs) { - vec<index_t, dims> x = **this; - vec<index_t, dims> y = *other; - mask<index_t, dims> inf = (x == infinite_size) || (y == infinite_size); + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; return select(inf, infinite_size, x - y); } + friend shape add_shape_undef(const shape& lhs, const shape& rhs) + { + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; + mask<index_t, dims> undef = min(x, y) == undefined_size; + return select(inf, infinite_size, select(undef, undefined_size, x + y)); + } + friend shape sub_shape_undef(const shape& lhs, const shape& rhs) + { + vec<index_t, dims> x = *lhs; + vec<index_t, dims> y = *rhs; + mask<index_t, dims> inf = max(x, y) == infinite_size; + mask<index_t, dims> undef = min(x, y) == undefined_size; + return select(inf, infinite_size, select(undef, undefined_size, x - y)); + } friend shape min(const shape& x, const shape& y) { return kfr::min(*x, *y); } diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp @@ -1,103 +0,0 @@ -/** @addtogroup utility - * @{ - */ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp
@@ -1,103 +0,0 @@
-/** @addtogroup utility
- *  @{
- */
-/*
-  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
-  This file is part of KFR
-
-  KFR is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 2 of the License, or
-  (at your option) any later version.
-
-  KFR is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with KFR.
-
-  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
-  Buying a commercial license is mandatory as soon as you develop commercial activities without
-  disclosing the source code of your own applications.
-  See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../math/min_max.hpp"
-#include "../simd/shuffle.hpp"
-#include "../simd/vec.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-/**
- * @brief Sort the elements in the vector in ascending order
- * @param x input vector
- * @return sorted vector
- * @code
- * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(-10, 1, 2, 1000));
- * @endcode
- */
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x)
-{
-    constexpr size_t Nhalf = N / 2;
-    vec<T, Nhalf> e = low(x);
-    vec<T, Nhalf> o = high(x);
-    constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
-    for (size_t i = 0; i < Nhalf; i++)
-    {
-        vec<T, Nhalf> t;
-        t = min(e, o);
-        o = max(e, o);
-        o = rotateright<1>(o);
-        e = t;
-        t = max(e, o);
-        o = min(e, o);
-        e = t;
-        t = blend(e, o, blend0);
-        o = blend(o, e, blend0);
-        o = rotateleft<1>(o);
-        e = t;
-    }
-    return interleavehalves(concat(e, o));
-}
-
-/**
- * @brief Sort the elements in the vector in descending order
- * @param x input vector
- * @return sorted vector
- * @code
- * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(1000, 2, 1, -10));
- * @endcode
- */
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x)
-{
-    constexpr size_t Nhalf = N / 2;
-    vec<T, Nhalf> e = low(x);
-    vec<T, Nhalf> o = high(x);
-    constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
-    for (size_t i = 0; i < Nhalf; i++)
-    {
-        vec<T, Nhalf> t;
-        t = max(e, o);
-        o = min(e, o);
-        o = rotateright<1>(o);
-        e = t;
-        t = min(e, o);
-        o = max(e, o);
-        e = t;
-        t = blend(e, o, blend0);
-        o = blend(o, e, blend0);
-        o = rotateleft<1>(o);
-        e = t;
-    }
-    return interleavehalves(concat(e, o));
-}
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/base/tensor.hpp b/include/kfr/base/tensor.hpp
@@ -29,8 +29,8 @@
 
 #include "../cometa/array.hpp"
 
-#include "../math/logical.hpp"
-#include "../math/min_max.hpp"
+#include "../simd/logical.hpp"
+#include "../simd/min_max.hpp"
 #include "../simd/horizontal.hpp"
 #include "../simd/impl/function.hpp"
 #include "../simd/read_write.hpp"
@@ -914,7 +914,7 @@ struct expression_traits<tensor<T, Dims>> : expression_traits_defaults
     {
         return self.shape();
     }
-    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ 0 }; }
+    KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ undefined_size }; }
 };
 
 inline namespace CMT_ARCH_NAME
diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp
@@ -24,22 +24,15 @@
 
 #include "simd.hpp"
 
-#include "math/abs.hpp"
 #include "math/asin_acos.hpp"
 #include "math/atan.hpp"
-#include "math/clamp.hpp"
 #include "math/compiletime.hpp"
 #include "math/complex_math.hpp"
 #include "math/gamma.hpp"
 #include "math/hyperbolic.hpp"
 #include "math/interpolation.hpp"
 #include "math/log_exp.hpp"
-#include "math/logical.hpp"
-#include "math/min_max.hpp"
 #include "math/modzerobessel.hpp"
-#include "math/round.hpp"
-#include "math/saturation.hpp"
-#include "math/select.hpp"
 #include "math/sin_cos.hpp"
 #include "math/sqrt.hpp"
 #include "math/tan.hpp"
diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp
@@ -26,12 +26,12 @@
 #pragma once
 
 #include "../simd/complex.hpp"
-#include "abs.hpp"
+#include "../simd/abs.hpp"
 #include "atan.hpp"
 #include "hyperbolic.hpp"
 #include "log_exp.hpp"
-#include "min_max.hpp"
-#include "select.hpp"
+#include "../simd/min_max.hpp"
+#include "../simd/select.hpp"
 #include "sin_cos.hpp"
 #include "sqrt.hpp"
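sort()/sortdesc() move verbatim to include/kfr/simd/sort.hpp (the A entry in the diffstat); usage stays as in the deleted doc comments:

#include <kfr/simd/sort.hpp>

using namespace kfr;

int main()
{
    const vec<int, 4> v = make_vector(1000, 1, 2, -10);
    const vec<int, 4> asc = sort(v);      // { -10, 1, 2, 1000 }
    const vec<int, 4> desc = sortdesc(v); // { 1000, 2, 1, -10 }
    return asc[0] == -10 && desc[0] == 1000 ? 0 : 1;
}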
diff --git a/include/kfr/math/impl/abs.hpp b/include/kfr/math/impl/abs.hpp
@@ -1,138 +0,0 @@
-/*
-  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
-  This file is part of KFR
-
-  KFR is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 2 of the License, or
-  (at your option) any later version.
-
-  KFR is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with KFR.
-
-  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
-  Buying a commercial license is mandatory as soon as you develop commercial activities without
-  disclosing the source code of your own applications.
-  See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/select.hpp"
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
-    return x & special_constants<T>::invhighbitmask();
-}
-
-KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT
-{
-    const __m128i sh = _mm_srai_epi32(x.v, 31);
-    const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
-    return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk);
-}
-KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); }
-KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); }
-KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); }
-KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; }
-
-#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT
-{
-    const __m256i sh = _mm256_srai_epi32(x.v, 31);
-    const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
-    return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk);
-}
-KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); }
-KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); }
-KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); }
-KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); }
-KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); }
-KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); }
-KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); }
-KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; }
-#endif
-
-KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); }
-KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); }
-KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); }
-#else
-KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); }
-#endif
-
-KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; }
-
-KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); }
-#else
-KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT
-{
-    return x & special_constants<f64>::invhighbitmask();
-}
-#endif
-
-KFR_HANDLE_ALL_SIZES_1(abs)
-
-#else
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
-    return x & special_constants<T>::invhighbitmask();
-}
-
-// fallback
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
-    return select(x >= T(0), x, -x);
-}
-#endif
-KFR_HANDLE_SCALAR(abs)
-} // namespace intrinsics
-
-KFR_I_FN(abs)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/math/impl/asin_acos.hpp b/include/kfr/math/impl/asin_acos.hpp
@@ -22,10 +22,10 @@
 */
 #pragma once
 
-#include "../../math/atan.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sqrt.hpp"
 #include "../../simd/impl/function.hpp"
+#include "../../simd/select.hpp"
+#include "../atan.hpp"
+#include "../sqrt.hpp"
 
 namespace kfr
 {
diff --git a/include/kfr/math/impl/atan.hpp b/include/kfr/math/impl/atan.hpp
@@ -21,9 +21,9 @@ See https://www.kfrlib.com for details.
 */
 #pragma once
-#include "../../math/abs.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sin_cos.hpp"
+#include "../../simd/abs.hpp"
+#include "../../simd/select.hpp"
+#include "../sin_cos.hpp"
 #include "../../simd/constants.hpp"
 #include "../../simd/impl/function.hpp"
 #include "../../simd/operators.hpp"
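The deleted abs implementation (re-added unchanged under include/kfr/simd/impl/) clears the IEEE-754 sign bit with a mask on the float path; a scalar sketch of that trick in plain C++ (abs_via_mask is a hypothetical name, not KFR API):

#include <cstdint>
#include <cstring>

// scalar analogue of x & special_constants<T>::invhighbitmask()
static float abs_via_mask(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits); // type-pun without UB
    bits &= 0x7fffffffu;                 // drop the sign bit
    std::memcpy(&x, &bits, sizeof x);
    return x;
}

int main() { return abs_via_mask(-1.5f) == 1.5f ? 0 : 1; }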
- - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#pragma once - -#include "../../math/min_max.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -template <typename T> -KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) -{ - return max(min(x, hi), lo); -} - -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) -{ - return max(min(x, hi), zerovector<T, N>()); -} -} // namespace intrinsics -KFR_I_FN(clamp) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/hyperbolic.hpp b/include/kfr/math/impl/hyperbolic.hpp @@ -22,10 +22,10 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/log_exp.hpp" -#include "../../math/min_max.hpp" -#include "../../math/select.hpp" +#include "../../simd/abs.hpp" +#include "../log_exp.hpp" +#include "../../simd/min_max.hpp" +#include "../../simd/select.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" diff --git a/include/kfr/math/impl/log_exp.hpp b/include/kfr/math/impl/log_exp.hpp @@ -22,11 +22,11 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/clamp.hpp" -#include "../../math/min_max.hpp" -#include "../../math/round.hpp" -#include "../../math/select.hpp" +#include "../../simd/abs.hpp" +#include "../../simd/clamp.hpp" +#include "../../simd/min_max.hpp" +#include "../../simd/round.hpp" +#include "../../simd/select.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp @@ -1,284 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
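// A portable sketch of what the bittestany/bittestall overloads below compute:
// bittestany is a horizontal OR over the mask bits (SSE4.1 maps it to
// !_mm_testz_si128(x, x)), bittestall a horizontal AND (_mm_testc_si128
// against an all-ones vector). Illustrative helpers over raw lane masks,
// not KFR API; each lane is assumed all-ones or all-zeros.
#include <cstddef>
#include <cstdint>
inline bool any_lane_sketch(const uint32_t* lanes, size_t n)
{
    uint32_t acc = 0;
    for (size_t i = 0; i < n; ++i) acc |= lanes[i]; // horizontal OR
    return acc != 0;
}
inline bool all_lanes_sketch(const uint32_t* lanes, size_t n)
{
    uint32_t acc = ~0u;
    for (size_t i = 0; i < n; ++i) acc &= lanes[i]; // horizontal AND
    return acc == ~0u;
}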
- */ -#pragma once - -#include "../../math/abs.hpp" -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -#if defined CMT_ARCH_SSE41 - -// horizontal OR -KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); } - -// horizontal AND -KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } -#endif - -#if defined CMT_ARCH_AVX -// horizontal OR -KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); } - -KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); } - -KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); } -KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); } - -// horizontal AND -KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } - -KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); } - -KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool 
bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } -KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); } - -#if defined CMT_ARCH_AVX512 -// horizontal OR -KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); } -KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); } -KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); } -KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); } - -// horizontal AND -KFR_INTRINSIC bool bittestall(const mf32avx512& x) -{ - return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); -} -KFR_INTRINSIC bool bittestall(const mf64avx512& x) -{ - return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); -} -KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); } -KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); } -KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); } - -#endif - -#elif defined CMT_ARCH_SSE41 -KFR_INTRINSIC bool bittestany(const mf32sse& x) -{ - return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v); -} -KFR_INTRINSIC bool bittestany(const mf64sse& x) -{ - return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v); -} -KFR_INTRINSIC bool bittestall(const mf32sse& x) -{ - return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v); -} -KFR_INTRINSIC bool bittestall(const mf64sse& x) -{ - return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v); -} -#endif - -#if !defined CMT_ARCH_SSE41 - -KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); } -KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); } -KFR_INTRINSIC bool bittestany(const 
mu8sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); } -KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); } - -KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); } -KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); } -KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); } -KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); } -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(expand_simd(a, bit<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(expand_simd(a, bit<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC bool bittestall(const mu32neon& a) -{ - const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); - return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; -} - -KFR_INTRINSIC bool bittestany(const mu32neon& a) -{ - const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); - return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; -} -KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } - -KFR_INTRINSIC bool bittestall(const mu8neon& a) { return 
bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<bit<u32>>(a)); } -KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(expand_simd(a, bit<T>(true))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestall(const mask<T, N>& a) -{ - return bittestall(low(a)) && bittestall(high(a)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(expand_simd(a, bit<T>(false))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> -KFR_INTRINSIC bool bittestany(const mask<T, N>& a) -{ - return bittestany(low(a)) || bittestany(high(a)); -} - -#else - -template <typename T, size_t N> -KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x) -{ - typename bitmask<N>::type val = 0; - for (size_t i = 0; i < N; i++) - { - val |= static_cast<int>(x[i]) << i; - } - return val; -} - -template <typename T, size_t N> -KFR_INTRINSIC bool bittestany(const mask<T, N>& x) -{ - return getmask(x).value; -} -template <typename T, size_t N> -KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y) -{ - return bittestany(x & y); -} - -template <typename T, size_t N> -KFR_INTRINSIC bool bittestall(const mask<T, N>& x) -{ - return !getmask(~x).value; -} -template <typename T, size_t N> -KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y) -{ - return !bittestany(~x & y); -} -#endif -} // namespace intrinsics -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/min_max.hpp b/include/kfr/math/impl/min_max.hpp @@ -1,236 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
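// The identity the min/max code below falls back on whenever no native vector
// instruction exists for a type (e.g. 64-bit min/max before AVX-512):
// min(x, y) == select(x < y, x, y), i.e. one compare plus a bitwise blend.
// One-lane sketch with an illustrative name, not KFR API.
#include <cstdint>
inline uint32_t min_u32_sketch(uint32_t x, uint32_t y)
{
    const uint32_t m = x < y ? ~0u : 0u; // comparison mask for one lane
    return (x & m) | (y & ~m);           // blend: x where mask set, else y
}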
- */ -#pragma once - -#include "../../math/abs.hpp" -#include "../../math/select.hpp" -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); } -KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); } -KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); } -KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); } - -KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); } -KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); } -KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); } -KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); } - -#if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); } - -KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); } - -#endif - -#if defined CMT_ARCH_AVX512 -KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); } -KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); } -KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); } -KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); } - -KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); } -KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); } -KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return 
_mm512_max_epi16(x.v, y.v); } -KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); } -KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); } -KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); } -KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); } -KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); } - -KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); } -KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); } -KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); } - -KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); } -KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); } -KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); } -KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); } -#else -KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } -KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } -KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } -KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } -#endif - -#if defined CMT_ARCH_AVX -KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); } -KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); } -KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); } -KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); } -#endif - -#if defined CMT_ARCH_SSE41 -KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); } -KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); } -KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); } - -KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); } -KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); } 
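// How the #else branch just below emulates the missing instructions:
// _mm_min_epi32 and friends only exist from SSE4.1 on, so earlier targets
// compare and blend instead. A minimal SSE2-only sketch (illustrative, not
// KFR code):
#include <emmintrin.h>
static inline __m128i min_epi32_sse2_sketch(__m128i x, __m128i y)
{
    const __m128i m = _mm_cmplt_epi32(x, y);    // all ones where x < y
    return _mm_or_si128(_mm_and_si128(m, x),    // keep x in those lanes
                        _mm_andnot_si128(m, y)); // keep y elsewhere
}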
-KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); } -#else -KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } -KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } - -KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } -KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } - -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); } -KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); } -KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); } -KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); } -KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); } -KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); } -KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } -KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } - -KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); } -KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); } -KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); } -KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); } -KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); } -KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); } -KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } -KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); } - -KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); } -KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); } -#if defined CMT_ARCH_NEON64 -KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); } -KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); } -#else -KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } -KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } -#endif - -KFR_HANDLE_ALL_SIZES_2(min) -KFR_HANDLE_ALL_SIZES_2(max) - -#else - -// fallback -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x < y, x, y); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) -{ - return select(x > y, x, y); -} -#endif - -template <typename T> -KFR_INTRINSIC T min(initialvalue<T>) -{ - return 
std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_INTRINSIC T max(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::min(); -} -template <typename T> -KFR_INTRINSIC T absmin(initialvalue<T>) -{ - return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() - : std::numeric_limits<T>::max(); -} -template <typename T> -KFR_INTRINSIC T absmax(initialvalue<T>) -{ - return 0; -} - -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) -{ - return min(abs(x), abs(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) -{ - return max(abs(x), abs(y)); -} - -KFR_HANDLE_SCALAR(min) -KFR_HANDLE_SCALAR(max) -KFR_HANDLE_SCALAR(absmin) -KFR_HANDLE_SCALAR(absmax) -} // namespace intrinsics -KFR_I_FN(min) -KFR_I_FN(max) -KFR_I_FN(absmin) -KFR_I_FN(absmax) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/round.hpp b/include/kfr/math/impl/round.hpp @@ -1,282 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
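// Why the non-SSE4.1 fallback further below may return large inputs
// unchanged: at or beyond fp_precision_limit (2^24 for f32, 2^52 for f64)
// the floating-point grid spacing is at least 1, so every representable
// value is already an integer. Scalar sketch of the floor path, names
// illustrative only:
#include <cstdint>
inline float floor_sketch(float x)
{
    if (!(x < 16777216.0f && x > -16777216.0f))
        return x;                      // huge, inf or NaN: pass through
    const float t = (float)(int32_t)x; // truncate toward zero
    return t - (x < t ? 1.0f : 0.0f);  // step down when truncation rounded up
}
// floor_sketch(-2.5f) == -3.0f, floor_sketch(2.5f) == 2.0f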
- */ -#pragma once - -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" -#include "abs.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_ss(V) \ - _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm_roundnearest_sd(V) \ - _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) -#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) -#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) - -#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) -#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) -#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); } -KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); } -KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); } -KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); } -KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); } -KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); } -KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); } -KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); } -KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); } -KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); } - -#if defined CMT_ARCH_AVX - -KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); } -KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); } -KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); } -KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); } -KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); } -KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); } -KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); } -KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); } -KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); } -KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); } - -#endif - -#if defined CMT_ARCH_AVX512 - -KFR_INTRINSIC f32avx512 floor(const f32avx512& value) -{ - 
return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 ceil(const f32avx512& value) -{ - return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 trunc(const f32avx512& value) -{ - return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 round(const f32avx512& value) -{ - return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 floor(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 ceil(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 trunc(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f64avx512 round(const f64avx512& value) -{ - return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); -} -KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); } -KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); } -#endif - -KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>) -KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>) - -#else - -// fallback - -template <typename T> -constexpr inline T fp_precision_limit = 4503599627370496.0; -template <> -constexpr inline f32 fp_precision_limit<f32> = 16777216.0f; - -template <size_t N> -KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x) -{ - vec<f32, N> t = innercast<f32>(innercast<i32>(x)); - return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f)); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x) -{ - vec<f64, N> t = innercast<f64>(innercast<i64>(x)); - return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.)); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x) -{ - vec<f32, N> t = innercast<f32>(innercast<i32>(x)); - return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f)); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x) -{ - vec<f64, N> t = innercast<f64>(innercast<i64>(x)); - return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.)); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f32>, x, - innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x)))); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f64>, x, - innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x)))); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x))); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x) -{ - return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x))); -} -template <size_t N> -KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x) -{ - return x - floor(x); -} -template <size_t N> -KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x) -{ - return x - floor(x); -} -#endif - -template <typename T, 
size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> -KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&) -{ - return T(0); -} - -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value) -{ - return innercast<IT>(floor(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value) -{ - return innercast<IT>(ceil(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value) -{ - return innercast<IT>(trunc(value)); -} -template <typename T, size_t N, typename IT = itype<T>> -KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value) -{ - return innercast<IT>(round(value)); -} - -KFR_HANDLE_SCALAR(floor) -KFR_HANDLE_SCALAR(ceil) -KFR_HANDLE_SCALAR(round) -KFR_HANDLE_SCALAR(trunc) -KFR_HANDLE_SCALAR(fract) -KFR_HANDLE_SCALAR(ifloor) -KFR_HANDLE_SCALAR(iceil) -KFR_HANDLE_SCALAR(iround) -KFR_HANDLE_SCALAR(itrunc) -} // namespace intrinsics -KFR_I_FN(floor) -KFR_I_FN(ceil) -KFR_I_FN(round) -KFR_I_FN(trunc) -KFR_I_FN(fract) -KFR_I_FN(ifloor) -KFR_I_FN(iceil) -KFR_I_FN(iround) -KFR_I_FN(itrunc) -} // namespace CMT_ARCH_NAME -} // namespace kfr - -#undef KFR_mm_trunc_ps -#undef KFR_mm_roundnearest_ps -#undef KFR_mm_trunc_pd -#undef KFR_mm_roundnearest_pd -#undef KFR_mm_trunc_ss -#undef KFR_mm_roundnearest_ss -#undef KFR_mm_trunc_sd -#undef KFR_mm_roundnearest_sd -#undef KFR_mm_floor_ss -#undef KFR_mm_floor_sd -#undef KFR_mm_ceil_ss -#undef KFR_mm_ceil_sd -#undef KFR_mm256_trunc_ps -#undef KFR_mm256_roundnearest_ps -#undef KFR_mm256_trunc_pd -#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/math/impl/saturation.hpp b/include/kfr/math/impl/saturation.hpp @@ -1,205 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
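// The branch-free pattern behind saturated_unsigned_add/sub below: unsigned
// overflow is detected against the remaining headroom, then select()
// substitutes the clamped bound. Scalar sketch with illustrative names,
// not KFR API:
#include <cstdint>
inline uint32_t satadd_u32_sketch(uint32_t a, uint32_t b)
{
    return a > UINT32_MAX - b ? UINT32_MAX : a + b; // clamp at the top
}
inline uint32_t satsub_u32_sketch(uint32_t a, uint32_t b)
{
    return a < b ? 0u : a - b;                      // clamp at zero
}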
- */ -#pragma once - -#include "../../math/select.hpp" -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -// Generic functions -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> sum = aa + bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), a, bitcast<T>(sum)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - using UT = utype<T>; - constexpr size_t shift = typebits<UT>::bits - 1; - vec<UT, N> aa = bitcast<UT>(a); - vec<UT, N> bb = bitcast<UT>(b); - const vec<UT, N> diff = aa - bb; - aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max()); - - return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), a, bitcast<T>(diff)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b) -{ - const vec<T, N> t = allonesvector(a); - return select(a > t - b, t, a + b); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b) -{ - return select(a < b, zerovector(a), a - b); -} - -#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); } -KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); } -KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); } - -KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); } -KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); } -KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); } -KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); } - -KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); } -KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); } - -KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); } -KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); } - -#if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); } -KFR_INTRINSIC 
i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); } - -KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); } -KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); } - -KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } -KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } - -KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } -KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } -#endif - -#if defined CMT_ARCH_AVX512 -KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); } -KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); } -KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); } -KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); } -KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); } -KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); } - -KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } -KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b) -{ - return saturated_unsigned_add(a, b); -} -KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b) -{ - return saturated_unsigned_add(a, b); -} -KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } -KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b) -{ - return saturated_unsigned_sub(a, b); -} -KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b) -{ - return saturated_unsigned_sub(a, b); -} -#endif - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); } -KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); } -KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { 
return vqaddq_u16(x.v, y.v); } -KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); } -KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); } -KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); } -KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); } -KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); } - -KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); } -KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); } -KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); } -KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); } -KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); } -KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); } -KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); } -KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); } - -KFR_HANDLE_ALL_SIZES_2(satadd) -KFR_HANDLE_ALL_SIZES_2(satsub) - -#else -// fallback -template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> -KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> -KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_add(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> -KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_signed_sub(a, b); -} -template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> -KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) -{ - return saturated_unsigned_sub(a, b); -} -#endif -KFR_HANDLE_SCALAR(satadd) -KFR_HANDLE_SCALAR(satsub) -} // namespace intrinsics -KFR_I_FN(satadd) -KFR_I_FN(satsub) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp @@ -1,331 +0,0 @@ -/* - Copyright (C) 2016 D Levin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
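// Two equivalent forms of select(mask, x, y) appear below: SSE4.1 and later
// use the blendv instructions, while the generic and ARMv7 f64 paths use the
// xor blend r = y ^ ((x ^ y) & m), which yields x where m is all ones and y
// where m is all zeros. One-lane sketch (illustrative name, not KFR API):
#include <cstdint>
inline uint32_t select_sketch(uint32_t m, uint32_t x, uint32_t y)
{
    return y ^ ((x ^ y) & m); // m must be all-ones or all-zeros per lane
}
// select_sketch(~0u, 1, 2) == 1 and select_sketch(0u, 1, 2) == 2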
- */ -#pragma once - -#include "../../simd/impl/function.hpp" -#include "../../simd/operators.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y) -{ - return _mm_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y) -{ - return _mm_blendv_ps(y.v, x.v, m.v); -} -KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y) -{ - return _mm_blendv_pd(y.v, x.v, m.v); -} - -#if defined CMT_ARCH_AVX -KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y) -{ - return _mm256_blendv_pd(y.v, x.v, m.v); -} -KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y) -{ - return _mm256_blendv_ps(y.v, x.v, m.v); -} -#endif - -#if defined CMT_ARCH_AVX2 -KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y) -{ - return _mm256_blendv_epi8(y.v, x.v, m.v); -} -#endif - -#if defined CMT_ARCH_AVX512 -KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y) -{ - return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v); -} -KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y) -{ - return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v); -} -KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y) -{ - 
return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y) -{ - return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y) -{ - return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y) -{ - return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v); -} -KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y) -{ - return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) - .shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); - // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) -{ - return concat2(select(a.h.low, b, c), select(a.h.high, b, c)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) -{ - return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>)) - .shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) -{ - return 
concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); -} - -#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS - -KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y) -{ - return vbslq_f32(m.v, x.v, y.v); -} -KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y) -{ - return vbslq_s8(m.v, x.v, y.v); -} -KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y) -{ - return vbslq_u8(m.v, x.v, y.v); -} -KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y) -{ - return vbslq_s16(m.v, x.v, y.v); -} -KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y) -{ - return vbslq_u16(m.v, x.v, y.v); -} -KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y) -{ - return vbslq_s32(m.v, x.v, y.v); -} -KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y) -{ - return vbslq_u32(m.v, x.v, y.v); -} -KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y) -{ - return vbslq_s64(m.v, x.v, y.v); -} -KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y) -{ - return vbslq_u64(m.v, x.v, y.v); -} - -#ifdef CMT_ARCH_NEON64 -KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) -{ - return vbslq_f64(m.v, x.v, y.v); -} -#else -KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -#endif - -template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - constexpr size_t Nout = next_simd_width<T>(N); - return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>)) - .shuffle(csizeseq<N>); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) -{ - return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) -{ - return select(m, vec<T, N>(x), vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y) -{ - return select(m, x, vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y) -{ - return select(m, vec<T, N>(x), y); -} - -#else - -// fallback -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y) -{ - return y ^ ((x ^ y) & m.asvec()); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) -{ - return select(m, vec<T, N>(x), vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y) -{ - return select(m, x, vec<T, N>(y)); -} -template <typename T, size_t N> -KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y) -{ - return select(m, vec<T, N>(x), y); -} -#endif -template <typename T1, typename T2> -KFR_INTRINSIC common_type<T1, T2> select(bool m, const T1& x, const T2& y) -{ - return m ? 
x : y; -} - -} // namespace intrinsics -KFR_I_FN(select) -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/math/impl/sin_cos.hpp b/include/kfr/math/impl/sin_cos.hpp @@ -22,10 +22,10 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/min_max.hpp" -#include "../../math/round.hpp" -#include "../../math/select.hpp" +#include "../../simd/abs.hpp" +#include "../../simd/min_max.hpp" +#include "../../simd/round.hpp" +#include "../../simd/select.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" diff --git a/include/kfr/math/impl/tan.hpp b/include/kfr/math/impl/tan.hpp @@ -22,12 +22,12 @@ */ #pragma once -#include "../../math/abs.hpp" -#include "../../math/select.hpp" -#include "../../math/sin_cos.hpp" +#include "../../simd/abs.hpp" #include "../../simd/constants.hpp" #include "../../simd/impl/function.hpp" #include "../../simd/operators.hpp" +#include "../../simd/select.hpp" +#include "../sin_cos.hpp" namespace kfr { diff --git a/include/kfr/math/interpolation.hpp b/include/kfr/math/interpolation.hpp @@ -25,7 +25,7 @@ */ #pragma once -#include "select.hpp" +#include "../simd/select.hpp" #include "sin_cos.hpp" namespace kfr diff --git a/include/kfr/simd.hpp b/include/kfr/simd.hpp @@ -22,15 +22,23 @@ */ #pragma once +#include "simd/abs.hpp" +#include "simd/clamp.hpp" #include "simd/comparison.hpp" #include "simd/complex.hpp" #include "simd/constants.hpp" #include "simd/digitreverse.hpp" #include "simd/horizontal.hpp" +#include "simd/logical.hpp" #include "simd/mask.hpp" +#include "simd/min_max.hpp" #include "simd/operators.hpp" #include "simd/platform.hpp" #include "simd/read_write.hpp" +#include "simd/round.hpp" +#include "simd/saturation.hpp" +#include "simd/select.hpp" #include "simd/shuffle.hpp" +#include "simd/sort.hpp" #include "simd/types.hpp" #include "simd/vec.hpp" diff --git a/include/kfr/math/abs.hpp b/include/kfr/simd/abs.hpp diff --git a/include/kfr/math/clamp.hpp b/include/kfr/simd/clamp.hpp diff --git a/include/kfr/simd/impl/abs.hpp b/include/kfr/simd/impl/abs.hpp @@ -0,0 +1,138 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
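
All of the select() overloads above implement one contract: keep x in lanes where the mask is true, y elsewhere. Two details are easy to misread. The SSE4.1/AVX blendv intrinsics pick their second operand where the mask's top bit is set, which is why every wrapper passes (y.v, x.v, m.v) rather than (x.v, y.v, m.v). And where no blend instruction exists, selection reduces to pure bit arithmetic, because the masks are all-ones or all-zeros per lane. A minimal scalar model of that fallback (illustrative only; select_bits is not a KFR symbol):

    #include <cstdint>

    // Model of the vector fallback y ^ ((x ^ y) & m):
    // m all-ones: y ^ (x ^ y) == x; m all-zeros: y unchanged.
    inline uint32_t select_bits(uint32_t m, uint32_t x, uint32_t y)
    {
        return y ^ ((x ^ y) & m);
    }

    // Usage with a comparison-produced mask, mirroring SIMD semantics:
    // uint32_t r = select_bits(a < b ? 0xFFFFFFFFu : 0u, a, b);
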
+ */ +#pragma once + +#include "../select.hpp" +#include "function.hpp" +#include "../operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & special_constants<T>::invhighbitmask(); +} + +KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT +{ + const __m128i sh = _mm_srai_epi32(x.v, 31); + const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk); +} +KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); } +KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); } +KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); } +KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; } + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT +{ + const __m256i sh = _mm256_srai_epi32(x.v, 31); + const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1)); + return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk); +} +KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); } +KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); } +KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); } +KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); } +KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); } +KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); } +KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); } +KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); } +KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); } +KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); } +#else +KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); } +#endif + +KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u32neon abs(const u32neon& x) 
CMT_NOEXCEPT { return x; } +KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; } + +KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); } +#else +KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT +{ + return x & special_constants<f64>::invhighbitmask(); +} +#endif + +KFR_HANDLE_ALL_SIZES_1(abs) + +#else + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return x & special_constants<T>::invhighbitmask(); +} + +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT +{ + return select(x >= T(0), x, -x); +} +#endif +KFR_HANDLE_SCALAR(abs) +} // namespace intrinsics + +KFR_I_FN(abs) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/clamp.hpp b/include/kfr/simd/impl/clamp.hpp @@ -0,0 +1,55 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../min_max.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T> +KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) +{ + return max(min(x, hi), lo); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) +{ + return max(min(x, hi), zerovector<T, N>()); +} +} // namespace intrinsics +KFR_I_FN(clamp) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/logical.hpp b/include/kfr/simd/impl/logical.hpp @@ -0,0 +1,284 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 
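
Two branch-free idioms carry the abs() kernels above: floating-point abs clears the sign bit by ANDing with invhighbitmask(), and the SSE i64 path builds a per-lane sign mask, then computes (x ^ msk) - msk, which negates exactly the negative lanes. A scalar sketch of the integer identity (illustrative; abs_branchless is not a KFR function):

    #include <cstdint>

    // msk is 0 for x >= 0 and all-ones for x < 0 (arithmetic shift).
    // (x ^ 0) - 0 == x; (x ^ ~0) - ~0 == ~x + 1 == -x in two's complement.
    inline int64_t abs_branchless(int64_t x)
    {
        const int64_t msk = x >> 63;
        return (x ^ msk) - msk;
    }
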
+ Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../abs.hpp" +#include "function.hpp" +#include "../operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +#if defined CMT_ARCH_SSE41 + +// horizontal OR +KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); } +#endif + +#if defined CMT_ARCH_AVX +// horizontal OR +KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); } + +KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); } +KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); } + +// horizontal AND +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); } + +KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); } +KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, 
allonesvector(x).v); }
+
+KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+
+#if defined CMT_ARCH_AVX512
+// horizontal OR
+KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); }
+
+// horizontal AND
+// ~ promotes __mmask8/__mmask16 to int, so the complement must be truncated
+// back to the mask width before testing, as in the 32/64-bit integer overloads.
+KFR_INTRINSIC bool bittestall(const mf32avx512& x)
+{
+    return !uint16_t(~_mm512_movepi32_mask(_mm512_castps_si512(x.v)));
+}
+KFR_INTRINSIC bool bittestall(const mf64avx512& x)
+{
+    return !uint8_t(~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)));
+}
+KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+
+#endif
+
+#elif defined CMT_ARCH_SSE41
+KFR_INTRINSIC bool bittestany(const mf32sse& x)
+{
+    return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
+}
+KFR_INTRINSIC bool bittestany(const mf64sse& x)
+{
+    return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
+}
+KFR_INTRINSIC bool bittestall(const mf32sse& x)
+{
+    return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
+}
+KFR_INTRINSIC bool bittestall(const mf64sse& x)
+{
+    return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
+}
+#endif
+
+#if !defined CMT_ARCH_SSE41
+
+KFR_INTRINSIC bool bittestany(const mf32sse& x) {
return _mm_movemask_ps(x.v); } +KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); } +KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); } +KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); } + +KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); } +KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); } +KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); } +KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); } +#endif + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(expand_simd(a, bit<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(expand_simd(a, bit<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC bool bittestall(const mu32neon& a) +{ + const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; +} + +KFR_INTRINSIC bool bittestany(const mu32neon& a) +{ + const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; +} +KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool 
bittestany(const mf64neon& a) { return bittestany(bitcast<bit<u32>>(a)); } + +KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<bit<u32>>(a)); } +KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<bit<u32>>(a)); } + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(expand_simd(a, bit<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestall(const mask<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(expand_simd(a, bit<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void> +KFR_INTRINSIC bool bittestany(const mask<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + +#else + +template <typename T, size_t N> +KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x) +{ + typename bitmask<N>::type val = 0; + for (size_t i = 0; i < N; i++) + { + val |= static_cast<int>(x[i]) << i; + } + return val; +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const mask<T, N>& x) +{ + return getmask(x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y) +{ + return bittestany(x & y); +} + +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const mask<T, N>& x) +{ + return !getmask(~x).value; +} +template <typename T, size_t N> +KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y) +{ + return !bittestany(~x & y); +} +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/min_max.hpp b/include/kfr/simd/impl/min_max.hpp @@ -0,0 +1,236 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
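
Every bittestany()/bittestall() above collapses a lane mask to a single bool: any-lane is a horizontal OR, all-lanes a horizontal AND. The instruction differs by ISA: PTEST on SSE4.1 (testz reports (a & b) == 0, testc reports (~a & b) == 0, hence the allonesvector second operand), movemask before SSE4.1, k-registers on AVX-512, and pairwise min/max on NEON. A scalar model of the movemask route (illustrative; these helpers are not KFR symbols):

    #include <cstddef>
    #include <cstdint>

    // Gather the top bit of each lane into a bitmask, like _mm_movemask_ps.
    inline uint32_t movemask32(const uint32_t* lanes, size_t n)
    {
        uint32_t bits = 0;
        for (size_t i = 0; i < n; ++i)
            bits |= (lanes[i] >> 31) << i;
        return bits;
    }

    inline bool any_lane(const uint32_t* lanes, size_t n) { return movemask32(lanes, n) != 0; }

    inline bool all_lanes(const uint32_t* lanes, size_t n) // assumes 0 < n < 32
    {
        return movemask32(lanes, n) == (uint32_t(1) << n) - 1;
    }
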
+ */ +#pragma once + +#include "../abs.hpp" +#include "../select.hpp" +#include "function.hpp" +#include "../operators.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); } +KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); } +KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); } +KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); } + +KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); } +KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); } +KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); } +KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); } + +#if defined CMT_ARCH_AVX2 +KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); } +KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); } + +KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); } +KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); } + +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); } +KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); } +KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); } + +KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); } +KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); } +KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); } +KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); } +KFR_INTRINSIC 
i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); } +KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); } +KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); } +KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); } +KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); } + +KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); } +KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); } +KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); } + +KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); } +KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); } +KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); } +#else +KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } +KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } +#endif + +#if defined CMT_ARCH_AVX +KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); } +KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); } +KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); } +KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); } +#endif + +#if defined CMT_ARCH_SSE41 +KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); } +KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); } +KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); } + +KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); } +KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); } +KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); } +KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& 
y) { return _mm_max_epu32(x.v, y.v); } +#else +KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } +KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } + +KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } +KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } + +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } + +KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); } +KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); } +KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); } +KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); } +KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); } +KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); } +KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); } +KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); } + +KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); } +KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); } +#if defined CMT_ARCH_NEON64 +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); } +#else +KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } +KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + +#else + +// fallback +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x < y, x, y); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) +{ + return select(x > y, x, y); +} +#endif + +template <typename T> +KFR_INTRINSIC T min(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? 
std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T max(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::min(); +} +template <typename T> +KFR_INTRINSIC T absmin(initialvalue<T>) +{ + return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::max(); +} +template <typename T> +KFR_INTRINSIC T absmax(initialvalue<T>) +{ + return 0; +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) +{ + return min(abs(x), abs(y)); +} +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) +{ + return max(abs(x), abs(y)); +} + +KFR_HANDLE_SCALAR(min) +KFR_HANDLE_SCALAR(max) +KFR_HANDLE_SCALAR(absmin) +KFR_HANDLE_SCALAR(absmax) +} // namespace intrinsics +KFR_I_FN(min) +KFR_I_FN(max) +KFR_I_FN(absmin) +KFR_I_FN(absmax) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/round.hpp b/include/kfr/simd/impl/round.hpp @@ -0,0 +1,282 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
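
Where the ISA lacks a native instruction, min() and max() above fall back to a comparison plus select(). The initialvalue overloads provide identity elements for reductions: min starts at +infinity (or the largest finite value for integer types), max at -infinity (or the smallest), and absmax at 0, so folding any element into the accumulator is a no-op until real data arrives. A sketch of an identity-seeded fold (illustrative; reduce_min is not a KFR function):

    #include <limits>
    #include <vector>

    template <typename T>
    T reduce_min(const std::vector<T>& v)
    {
        // Same identity as min(initialvalue<T>) above.
        T acc = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
                                                     : std::numeric_limits<T>::max();
        for (const T& x : v)
            acc = x < acc ? x : acc; // select(x < acc, x, acc) in vector form
        return acc;
    }
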
+ */ +#pragma once + +#include "function.hpp" +#include "../operators.hpp" +#include "abs.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_ss(V) \ + _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm_roundnearest_sd(V) \ + _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V)) +#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V)) +#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V)) + +#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) +#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) +#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) + +#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); } +KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); } +KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); } +KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); } +KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); } +KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); } +KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); } +KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); } +KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); } +KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); } + +#if defined CMT_ARCH_AVX + +KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); } +KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); } +KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); } +KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); } +KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); } +KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); } +KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); } +KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); } +KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); } +KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); } + +#endif + +#if defined CMT_ARCH_AVX512 + +KFR_INTRINSIC f32avx512 floor(const f32avx512& value) +{ + return 
_mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 ceil(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 trunc(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 round(const f32avx512& value) +{ + return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 floor(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 ceil(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 trunc(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f64avx512 round(const f64avx512& value) +{ + return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); } +KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); } +#endif + +KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>) +KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>) + +#else + +// fallback + +template <typename T> +constexpr inline T fp_precision_limit = 4503599627370496.0; +template <> +constexpr inline f32 fp_precision_limit<f32> = 16777216.0f; + +template <size_t N> +KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x) +{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x) +{ + vec<f32, N> t = innercast<f32>(innercast<i32>(x)); + return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f)); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x) +{ + vec<f64, N> t = innercast<f64>(innercast<i64>(x)); + return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.)); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, + innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, + innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x)))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x) +{ + return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x))); +} +template <size_t N> +KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x) +{ + return x - floor(x); +} +template <size_t N> +KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x) +{ + return x - floor(x); +} +#endif + +template <typename T, size_t 
N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> +KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&) +{ + return T(0); +} + +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value) +{ + return innercast<IT>(floor(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value) +{ + return innercast<IT>(ceil(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value) +{ + return innercast<IT>(trunc(value)); +} +template <typename T, size_t N, typename IT = itype<T>> +KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value) +{ + return innercast<IT>(round(value)); +} + +KFR_HANDLE_SCALAR(floor) +KFR_HANDLE_SCALAR(ceil) +KFR_HANDLE_SCALAR(round) +KFR_HANDLE_SCALAR(trunc) +KFR_HANDLE_SCALAR(fract) +KFR_HANDLE_SCALAR(ifloor) +KFR_HANDLE_SCALAR(iceil) +KFR_HANDLE_SCALAR(iround) +KFR_HANDLE_SCALAR(itrunc) +} // namespace intrinsics +KFR_I_FN(floor) +KFR_I_FN(ceil) +KFR_I_FN(round) +KFR_I_FN(trunc) +KFR_I_FN(fract) +KFR_I_FN(ifloor) +KFR_I_FN(iceil) +KFR_I_FN(iround) +KFR_I_FN(itrunc) +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#undef KFR_mm_trunc_ps +#undef KFR_mm_roundnearest_ps +#undef KFR_mm_trunc_pd +#undef KFR_mm_roundnearest_pd +#undef KFR_mm_trunc_ss +#undef KFR_mm_roundnearest_ss +#undef KFR_mm_trunc_sd +#undef KFR_mm_roundnearest_sd +#undef KFR_mm_floor_ss +#undef KFR_mm_floor_sd +#undef KFR_mm_ceil_ss +#undef KFR_mm_ceil_sd +#undef KFR_mm256_trunc_ps +#undef KFR_mm256_roundnearest_ps +#undef KFR_mm256_trunc_pd +#undef KFR_mm256_roundnearest_pd diff --git a/include/kfr/simd/impl/saturation.hpp b/include/kfr/simd/impl/saturation.hpp @@ -0,0 +1,205 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
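
The non-SSE4.1 rounding fallback above relies on one property: once |x| reaches 2^24 for f32 (16777216.0f) or 2^52 for f64 (4503599627370496.0), every representable value is already an integer, so x passes through unchanged and the int-cast below the limit can never overflow. trunc() round-trips through the integer type, floor()/ceil() nudge the truncated value by one when it lands on the wrong side, and round() first adds 0.5 with the sign of x (round half away from zero). A scalar sketch of the floor case (illustrative; floor_via_cast is not a KFR function):

    #include <cmath>
    #include <cstdint>

    inline float floor_via_cast(float x)
    {
        if (std::fabs(x) >= 16777216.0f) // 2^24: f32 exactness threshold
            return x;
        const float t = static_cast<float>(static_cast<int32_t>(x)); // truncates toward zero
        return x < t ? t - 1.0f : t; // step down for negative non-integers
    }
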
+ */
+#pragma once
+
+#include "../select.hpp"
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Generic functions
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+    using UT               = utype<T>;
+    constexpr size_t shift = typebits<UT>::bits - 1;
+    vec<UT, N> aa          = bitcast<UT>(a);
+    vec<UT, N> bb          = bitcast<UT>(b);
+    const vec<UT, N> sum   = aa + bb;
+    aa                     = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+    // On overflow, return the saturation bound precomputed in aa from the sign of a.
+    return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), bitcast<T>(aa), bitcast<T>(sum));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+    using UT               = utype<T>;
+    constexpr size_t shift = typebits<UT>::bits - 1;
+    vec<UT, N> aa          = bitcast<UT>(a);
+    vec<UT, N> bb          = bitcast<UT>(b);
+    const vec<UT, N> diff  = aa - bb;
+    aa                     = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+    // On overflow, return the saturation bound precomputed in aa from the sign of a.
+    return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), bitcast<T>(aa), bitcast<T>(diff));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+    const vec<T, N> t = allonesvector(a);
+    return select(a > t - b, t, a + b);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+    return select(a < b, zerovector(a), a - b);
+}
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); }
+
+KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
+KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
+
+KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
+KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx satadd(const i16avx& x,
const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); } + +KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } +KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } + +KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } +KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } +#endif + +#if defined CMT_ARCH_AVX512 +KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); } +KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); } +KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); } +KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); } +KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); } + +KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } +KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_add(a, b); +} +KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } +KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS + +KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); } 
+KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); } + +KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); } +KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); } +KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); } +KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); } +KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); } +KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); } +KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); } +KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); } + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + +#else +// fallback +template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> +KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_add(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_signed_sub(a, b); +} +template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)> +KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b) +{ + return saturated_unsigned_sub(a, b); +} +#endif +KFR_HANDLE_SCALAR(satadd) +KFR_HANDLE_SCALAR(satsub) +} // namespace intrinsics +KFR_I_FN(satadd) +KFR_I_FN(satsub) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/impl/select.hpp b/include/kfr/simd/impl/select.hpp @@ -0,0 +1,331 @@ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
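The generic paths above are branch-free: the clamp value is derived from the sign of `a` alone (`(aa >> shift) + max` yields the type's maximum for non-negative `a` and wraps to its minimum for negative `a`), and overflow is detected from sign bits. A minimal scalar sketch of the same idea, with the overflow test written in its more common `(a ^ sum) & (b ^ sum)` form (illustrative only; `satadd_scalar` is not part of KFR):

#include <cstdint>
#include <cstdio>
#include <limits>

// Branch-free saturating signed add, scalar sketch of the SIMD routine above.
static int32_t satadd_scalar(int32_t a, int32_t b)
{
    const uint32_t ua  = static_cast<uint32_t>(a);
    const uint32_t ub  = static_cast<uint32_t>(b);
    const uint32_t sum = ua + ub; // wrapping add in unsigned arithmetic
    // INT32_MAX if a >= 0, INT32_MIN if a < 0: the only two possible clamps.
    const uint32_t sat = (ua >> 31) + static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
    // Signed overflow occurred iff a and b share a sign that differs from the sum's.
    const bool overflow = (((ua ^ sum) & (ub ^ sum)) >> 31) != 0;
    return static_cast<int32_t>(overflow ? sat : sum);
}

int main()
{
    std::printf("%d\n", satadd_scalar(INT32_MAX, 1));  // INT32_MAX, not wraparound
    std::printf("%d\n", satadd_scalar(INT32_MIN, -1)); // INT32_MIN
    std::printf("%d\n", satadd_scalar(100, 23));       // 123
}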
diff --git a/include/kfr/simd/impl/select.hpp b/include/kfr/simd/impl/select.hpp
@@ -0,0 +1,331 @@
+/*
+  Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+  This file is part of KFR
+
+  KFR is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 2 of the License, or
+  (at your option) any later version.
+
+  KFR is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with KFR.
+
+  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+  Buying a commercial license is mandatory as soon as you develop commercial activities without
+  disclosing the source code of your own applications.
+  See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
+{
+    return _mm_blendv_ps(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
+{
+    return _mm_blendv_pd(y.v, x.v, m.v);
+}
+
+#if defined CMT_ARCH_AVX
+KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
+{
+    return _mm256_blendv_pd(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
+{
+    return _mm256_blendv_ps(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
+    // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
+{
+    return concat2(select(a.h.low, b, c), select(a.h.high, b, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
+{
+    return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
+{
+    return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
+}
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
+{
+    return vbslq_f32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y)
+{
+    return vbslq_s8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y)
+{
+    return vbslq_u8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
+{
+    return vbslq_s16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
+{
+    return vbslq_u16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
+{
+    return vbslq_s32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
+{
+    return vbslq_u32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
+{
+    return vbslq_s64(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
+{
+    return vbslq_u64(m.v, x.v, y.v);
+}
+
+#ifdef CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+    return vbslq_f64(m.v, x.v, y.v);
+}
+#else
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+    return y ^ ((x ^ y) & m.asvec());
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
+{
+    return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
+{
+    return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
+{
+    return select(m, vec<T, N>(x), y);
+}
+
+#else
+
+// fallback
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+{
+    return y ^ ((x ^ y) & m.asvec());
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
+{
+    return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
+{
+    return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
+{
+    return select(m, vec<T, N>(x), y);
+}
+#endif
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> select(bool m, const T1& x, const T2& y)
+{
+    return m ? x : y;
+}
+
+} // namespace intrinsics
+KFR_I_FN(select)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
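The NEON f64 path and the generic fallback both use the same bitwise blend, `y ^ ((x ^ y) & m)`, which relies on each mask lane being all ones or all zeros. A scalar sketch of why this works (illustrative; `select_bits` is not a KFR function):

#include <cstdint>
#include <cstdio>

// With m == all-ones, (x ^ y) & m == x ^ y, so y ^ (x ^ y) == x.
// With m == all-zeros, (x ^ y) & m == 0, so the result is y. No branches.
static uint32_t select_bits(uint32_t m, uint32_t x, uint32_t y)
{
    return y ^ ((x ^ y) & m);
}

int main()
{
    std::printf("%u\n", select_bits(0xFFFFFFFFu, 1u, 2u)); // 1
    std::printf("%u\n", select_bits(0x00000000u, 1u, 2u)); // 2
}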
x : y; +} + +} // namespace intrinsics +KFR_I_FN(select) +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/math/logical.hpp b/include/kfr/simd/logical.hpp diff --git a/include/kfr/math/min_max.hpp b/include/kfr/simd/min_max.hpp diff --git a/include/kfr/math/round.hpp b/include/kfr/simd/round.hpp diff --git a/include/kfr/math/saturation.hpp b/include/kfr/simd/saturation.hpp diff --git a/include/kfr/math/select.hpp b/include/kfr/simd/select.hpp diff --git a/include/kfr/simd/sort.hpp b/include/kfr/simd/sort.hpp @@ -0,0 +1,103 @@ +/** @addtogroup utility + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "min_max.hpp" +#include "shuffle.hpp" +#include "vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +/** + * @brief Sort the elements in the vector in ascending order + * @param x input vector + * @return sorted vector + * @code + * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(-10, 1, 2, 1000)); + * @endcode + */ +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x) +{ + constexpr size_t Nhalf = N / 2; + vec<T, Nhalf> e = low(x); + vec<T, Nhalf> o = high(x); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); + for (size_t i = 0; i < Nhalf; i++) + { + vec<T, Nhalf> t; + t = min(e, o); + o = max(e, o); + o = rotateright<1>(o); + e = t; + t = max(e, o); + o = min(e, o); + e = t; + t = blend(e, o, blend0); + o = blend(o, e, blend0); + o = rotateleft<1>(o); + e = t; + } + return interleavehalves(concat(e, o)); +} + +/** + * @brief Sort the elements in the vector in descending order + * @param x input vector + * @return sorted vector + * @code + * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(1000, 2, 1, -10)); + * @endcode + */ +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x) +{ + constexpr size_t Nhalf = N / 2; + vec<T, Nhalf> e = low(x); + vec<T, Nhalf> o = high(x); + constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>); + for (size_t i = 0; i < Nhalf; i++) + { + vec<T, Nhalf> t; + t = max(e, o); + o = min(e, o); + o = rotateright<1>(o); + e = t; + t = min(e, o); + o = max(e, o); + e = t; + t = blend(e, o, blend0); + o = blend(o, e, blend0); + o = rotateleft<1>(o); + e = t; + } + return interleavehalves(concat(e, o)); +} +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/sources.cmake b/sources.cmake @@ -27,13 +27,14 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/math_expressions.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp + 
diff --git a/sources.cmake b/sources.cmake
@@ -27,13 +27,14 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/math_expressions.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/base/old_basic_expressions.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/base/random_bits.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/shape.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_expressions.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/tensor.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/static_array.hpp
@@ -91,65 +92,66 @@ set(
     ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h
     ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_mp3.h
     ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/complex_math.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/gamma.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/hyperbolic.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/interpolation.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/log_exp.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/modzerobessel.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/round.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/saturation.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/sin_cos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/sqrt.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/tan.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/asin_acos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/atan.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/gamma.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/hyperbolic.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/log_exp.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/modzerobessel.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/round.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/saturation.hpp
-    ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sin_cos.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sqrt.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/tan.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid_auto.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/abs.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/comparison.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex_type.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/constants.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/digitreverse.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/horizontal.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/logical.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/mask.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/platform.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/read_write.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/round.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/saturation.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/shuffle.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/sort.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/types.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/vec.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/abs.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_clang.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_generic.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_clang.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_complex.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/clamp.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/logical.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/min_max.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/round.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/saturation.hpp
+    ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/select.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specialconstants.hpp
     ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
@@ -187,18 +189,18 @@ set(
     ${PROJECT_SOURCE_DIR}/tests/unit/base/tensor.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/graphics/color.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/graphics/geometry.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/abs.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/asin_acos.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/atan.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/hyperbolic.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/log_exp.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/min_max.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/round.cpp
-    ${PROJECT_SOURCE_DIR}/tests/unit/math/select.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/sin_cos.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/math/tan.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/abs.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/complex.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/min_max.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/operators.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/round.cpp
+    ${PROJECT_SOURCE_DIR}/tests/unit/simd/select.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/shuffle.cpp
     ${PROJECT_SOURCE_DIR}/tests/unit/simd/vec.cpp
 )
diff --git a/tests/unit/math/abs.cpp b/tests/unit/simd/abs.cpp
diff --git a/tests/unit/math/min_max.cpp b/tests/unit/simd/min_max.cpp
diff --git a/tests/unit/math/round.cpp b/tests/unit/simd/round.cpp
diff --git a/tests/unit/math/select.cpp b/tests/unit/simd/select.cpp
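Taken together, this commit moves the element-wise vector operations (abs, clamp, logical, min_max, round, saturation, select, sort) from kfr/math/ and kfr/base/ to kfr/simd/. A minimal sketch of downstream usage against the new header locations, assuming the usual public wrappers `satadd`, `select`, and `sort` shown in the files above:

#include <cstdint>
#include <kfr/simd/saturation.hpp> // satadd / satsub (moved from kfr/math/saturation.hpp)
#include <kfr/simd/select.hpp>     // select (moved from kfr/math/select.hpp)
#include <kfr/simd/sort.hpp>       // sort / sortdesc (moved from kfr/base/sort.hpp)

using namespace kfr;

int main()
{
    const vec<int16_t, 8> a{ 32000, -32000, 5, 2, 7, 4, 1, 6 };
    const vec<int16_t, 8> b{ 1000, -1000, 1, 1, 1, 1, 1, 1 };
    const vec<int16_t, 8> clamped = satadd(a, b);        // clamps to 32767 / -32768 instead of wrapping
    const vec<int16_t, 8> mixed   = select(a > b, a, b); // per-lane maximum via mask blend
    const vec<int16_t, 8> sorted  = sort(mixed);         // ascending, in registers
    (void)clamped;
    (void)sorted;
    return 0;
}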