commit 940f5d9c26ef2a724a5faf4f421470ca6ef72564
parent 51d4e60bd55b1a68556ac5ca2d443c970769adb1
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Wed, 12 Oct 2022 14:40:12 +0100
Refactoring
Diffstat:
49 files changed, 2199 insertions(+), 1994 deletions(-)
diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp
@@ -29,7 +29,6 @@
#include "base/expression.hpp"
#include "base/filter.hpp"
#include "base/fraction.hpp"
-#include "base/function_expressions.hpp"
#include "base/generators.hpp"
#include "base/math_expressions.hpp"
#include "base/memory.hpp"
@@ -38,5 +37,4 @@
#include "base/reduce.hpp"
#include "base/simd_expressions.hpp"
#include "base/small_buffer.hpp"
-#include "base/sort.hpp"
#include "base/univector.hpp"
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -93,8 +93,8 @@ struct expression_traits<xcounter<T, Dims>> : expression_traits_defaults
using value_type = T;
constexpr static size_t dims = Dims;
- constexpr static shape<dims> shapeof(const xcounter<T, Dims>& self) { return shape<dims>(max_index_t); }
- constexpr static shape<dims> shapeof() { return shape<dims>(max_index_t); }
+ constexpr static shape<dims> shapeof(const xcounter<T, Dims>& self) { return shape<dims>(infinite_size); }
+ constexpr static shape<dims> shapeof() { return shape<dims>(infinite_size); }
};
template <typename T, typename... Args, typename Tout = std::common_type_t<T, Args...>>
@@ -151,9 +151,9 @@ struct expression_traits<xslice<Arg>> : expression_traits_defaults
KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xslice<Arg>& self)
{
- return min(ArgTraits::shapeof(self.first()).sub_inf(self.start), self.size);
+ return min(sub_shape(ArgTraits::shapeof(self.first()), self.start), self.size);
}
- KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(0); }
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(undefined_size); }
};
template <typename Arg, KFR_ACCEPT_EXPRESSIONS(Arg), index_t Dims = expression_dims<Arg>>
@@ -333,8 +333,11 @@ struct expression_traits<xpadded<Arg>> : expression_traits_defaults
constexpr static size_t dims = ArgTraits::dims;
constexpr static bool random_access = ArgTraits::random_access;
- KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xpadded<Arg>& self) { return infinite_size; }
- KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return infinite_size; }
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xpadded<Arg>& self)
+ {
+ return shape<dims>(infinite_size);
+ }
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>(infinite_size); }
};
inline namespace CMT_ARCH_NAME
@@ -516,7 +519,7 @@ struct expression_traits<xreshape<Arg, OutDims>> : expression_traits_defaults
{
return self.out_shape;
}
- KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ 0 }; }
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ undefined_size }; }
};
inline namespace CMT_ARCH_NAME
@@ -647,7 +650,7 @@ struct expression_traits<xlinspace<T, truncated>> : expression_traits_defaults
{
return shape<dims>(truncated ? self.size : infinite_size);
}
- constexpr static shape<dims> shapeof() { return shape<dims>(truncated ? 0 : infinite_size); }
+ constexpr static shape<dims> shapeof() { return shape<dims>(truncated ? undefined_size : infinite_size); }
};
template <bool truncated = false, typename T1, typename T2, typename Tout = std::common_type_t<T1, T2>>
@@ -672,4 +675,106 @@ KFR_INTRINSIC vec<T, N> get_elements(const xlinspace<T, truncated>& self, const
// ----------------------------------------------------------------------------
+template <typename Arg1, typename Arg2, index_t ConcatAxis>
+struct xconcatenate : public xwitharguments<Arg1, Arg2>
+{
+ static_assert(expression_dims<Arg1> == expression_dims<Arg2>);
+ static_assert(std::is_same_v<expression_value_type<Arg1>, expression_value_type<Arg2>>);
+ constexpr static index_t dims = expression_dims<Arg1>;
+ shape<dims> size1;
+
+ KFR_MEM_INTRINSIC xconcatenate(Arg1&& arg1, Arg2&& arg2)
+ : xwitharguments<Arg1, Arg2>{ std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) },
+ size1(expression_traits<Arg1>::shapeof(std::get<0>(this->args))) // read the stored copy, not the moved-from arg1
+ {
+ }
+};
+
+template <typename Arg1, typename Arg2, index_t ConcatAxis>
+struct expression_traits<xconcatenate<Arg1, Arg2, ConcatAxis>> : expression_traits_defaults
+{
+ using ArgTraits1 = expression_traits<Arg1>;
+ using ArgTraits2 = expression_traits<Arg2>;
+
+ using value_type = typename ArgTraits1::value_type;
+ constexpr static size_t dims = ArgTraits1::dims;
+ constexpr static bool random_access = ArgTraits1::random_access && ArgTraits2::random_access;
+
+ KFR_INTRINSIC static shape<dims> concat_shape(const shape<dims>& sh1, const shape<dims>& sh2)
+ {
+ shape<dims> result = min(sh1, sh2);
+ shape<dims> sum = add_shape_undef(sh1, sh2);
+ result[ConcatAxis] = sum[ConcatAxis];
+ return result;
+ }
+
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof(const xconcatenate<Arg1, Arg2, ConcatAxis>& self)
+ {
+ return concat_shape(ArgTraits1::shapeof(std::get<0>(self.args)), ArgTraits2::shapeof(std::get<1>(self.args)));
+ }
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof()
+ {
+ return concat_shape(ArgTraits1::shapeof(), ArgTraits2::shapeof());
+ }
+};
+
+template <index_t ConcatAxis = 0, typename Arg1, typename Arg2, KFR_ACCEPT_EXPRESSIONS(Arg1, Arg2)>
+KFR_INTRINSIC xconcatenate<Arg1, Arg2, ConcatAxis> concatenate(Arg1&& arg1, Arg2&& arg2)
+{
+ return { std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) };
+}
+
+template <index_t ConcatAxis = 0, typename Arg1, typename Arg2, typename Arg3,
+ KFR_ACCEPT_EXPRESSIONS(Arg1, Arg2, Arg3)>
+KFR_INTRINSIC xconcatenate<Arg1, xconcatenate<Arg2, Arg3, ConcatAxis>, ConcatAxis> concatenate(Arg1&& arg1,
+ Arg2&& arg2,
+ Arg3&& arg3)
+{
+ return { std::forward<Arg1>(arg1), { std::forward<Arg2>(arg2), std::forward<Arg3>(arg3) } };
+}
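// Editorial usage sketch (not part of this commit): joining 1-D expressions
// along the default axis 0. Assumes univector assignment from expressions
// works as elsewhere in KFR at this revision.
//   univector<float> a{ 1, 2, 3 };
//   univector<float> b{ 4, 5 };
//   univector<float, 5> ab = concatenate(a, b); // shape: 3 + 2 = 5 -> { 1, 2, 3, 4, 5 }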
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename Arg1, typename Arg2, index_t ConcatAxis, index_t NDims, index_t Axis, size_t N,
+ typename T = typename expression_traits<xconcatenate<Arg1, Arg2, ConcatAxis>>::value_type>
+KFR_INTRINSIC vec<T, N> get_elements(const xconcatenate<Arg1, Arg2, ConcatAxis>& self,
+ const shape<NDims>& index, const axis_params<Axis, N>& sh)
+{
+ const shape<NDims> size1 = self.size1; // size1 is a shape; it is indexed by ConcatAxis below
+ constexpr index_t Naxis = ConcatAxis == Axis ? N : 1;
+ if (index[ConcatAxis] >= size1[ConcatAxis])
+ {
+ shape index1 = index;
+ index1[ConcatAxis] -= size1[ConcatAxis];
+ return get_elements(std::get<1>(self.args), index1, sh);
+ }
+ else if (CMT_LIKELY(index[ConcatAxis] + Naxis <= size1[ConcatAxis]))
+ {
+ return get_elements(std::get<0>(self.args), index, sh);
+ }
+ else // (index < size1) && (index + N > size1)
+ {
+ vec<T, N> result;
+ // Here Axis == ConcatAxis
+ shape index1 = index;
+ for (index_t i = 0; i < size1[ConcatAxis] - index[ConcatAxis]; ++i)
+ {
+ result[i] = get_elements(std::get<0>(self.args), index1, axis_params<Axis, 1>{})[0];
+ ++index1[ConcatAxis];
+ }
+ index1[ConcatAxis] -= size1[ConcatAxis];
+ for (index_t i = size1[ConcatAxis] - index[ConcatAxis]; i < N; ++i)
+ {
+ result[i] = get_elements(std::get<1>(self.args), index1, axis_params<Axis, 1>{})[0];
+ ++index1[ConcatAxis];
+ }
+ return result;
+ }
+}
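// Worked example of the three branches above (editorial note): with
// size1[ConcatAxis] == 5 and N == 4,
//   index == 6 -> entirely in arg2, read at shifted index 1;
//   index == 0 -> entirely in arg1 (0 + 4 <= 5), vectorized fast path;
//   index == 3 -> straddles the seam: lanes 0..1 come from arg1
//                 (indices 3, 4), lanes 2..3 from arg2 (indices 0, 1).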
+
+} // namespace CMT_ARCH_NAME
+
+// ----------------------------------------------------------------------------
+
} // namespace kfr
diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp
@@ -25,7 +25,7 @@
*/
#pragma once
-#include "../math/clamp.hpp"
+#include "../simd/clamp.hpp"
#include "../simd/types.hpp"
#include "../simd/vec.hpp"
#include "univector.hpp"
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -131,7 +131,7 @@ using enable_if_output_expression =
vec<typename expression_traits<T>::value_type, 1>{}))>;
template <typename T>
-using enable_if_inout_output_expression =
+using enable_if_input_output_expression =
std::void_t<enable_if_input_expression<T>, enable_if_output_expression<T>>;
template <typename... T>
@@ -140,6 +140,27 @@ using enable_if_input_expressions = std::void_t<enable_if_input_expression<T>...
template <typename... T>
using enable_if_output_expressions = std::void_t<enable_if_output_expression<T>...>;
+template <typename... T>
+using enable_if_input_output_expressions = std::void_t<enable_if_input_output_expression<T>...>;
+
+template <typename E, typename = void>
+constexpr inline bool is_input_expression = false;
+
+template <typename E>
+constexpr inline bool is_input_expression<E, enable_if_input_expression<E>> = true;
+
+template <typename E, typename = void>
+constexpr inline bool is_output_expression = false;
+
+template <typename E>
+constexpr inline bool is_output_expression<E, enable_if_output_expression<E>> = true;
+
+template <typename E, typename = void>
+constexpr inline bool is_input_output_expression = false;
+
+template <typename E>
+constexpr inline bool is_input_output_expression<E, enable_if_input_output_expression<E>> = true;
+
#define KFR_ACCEPT_EXPRESSIONS(...) internal_generic::expressions_check<__VA_ARGS__>* = nullptr
template <typename T>
@@ -311,7 +332,7 @@ static auto process(Out&& out, In&& in, shape<outdims> start = shape<outdims>(0)
const shape<indims> inshape = Trin::shapeof(in);
if (CMT_UNLIKELY(!internal_generic::can_assign_from(outshape, inshape)))
return shape<outdims>{ 0 };
- shape<outdims> stop = min(start.add_inf(size), outshape);
+ shape<outdims> stop = min(add_shape(start, size), outshape);
index_t in_size = 0;
if constexpr (indims > 0)
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -26,107 +26,108 @@
#pragma once
#include "../math/log_exp.hpp"
-#include "../math/select.hpp"
#include "../math/sin_cos.hpp"
#include "../simd/impl/function.hpp"
+#include "../simd/select.hpp"
#include "../simd/vec.hpp"
+#include "shape.hpp"
namespace kfr
{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace internal
-{
-template <typename T, size_t width_, typename Class>
-struct generator : input_expression
+template <typename T, size_t VecWidth, typename Class, typename Twork = T>
+struct xgenerator
{
- constexpr static size_t width = width_;
- using value_type = T;
-
- constexpr static bool is_incremental = true;
-
- template <typename U, size_t N>
- friend KFR_INTRINSIC vec<U, N> get_elements(const generator& self, cinput_t, size_t index,
- vec_shape<U, N> t)
- {
- return self.generate(t);
- }
+ constexpr static size_t width = VecWidth;
void resync(T start) const { ptr_cast<Class>(this)->sync(start); }
-protected:
- void call_next() const { ptr_cast<Class>(this)->next(); }
template <size_t N>
- void call_shift(csize_t<N>) const
+ KFR_MEM_INTRINSIC vec<T, N> generate() const
{
- ptr_cast<Class>(this)->shift(csize_t<N>());
+ if constexpr (N < width)
+ {
+ const vec<T, N> result = narrow<N>(call_get_value());
+ const vec<Twork, width> oldvalue = value;
+ call_next();
+ value = slice<N, width>(oldvalue, value);
+ return result;
+ }
+ else if (N > width)
+ {
+ const vec lo = generate(low(x));
+ const vec hi = generate(high(x));
+ return concat(lo, hi);
+ }
+ else // N == width
+ {
+ const vec<T, N> result = call_get_value();
+ call_next();
+ return result;
+ }
}
+ mutable vec<Twork, width> value;
- template <size_t N>
- void shift(csize_t<N>) const
- {
- const vec<T, width> oldvalue = value;
- call_next();
- value = slice<N, width>(oldvalue, value);
- }
+private:
+ KFR_MEM_INTRINSIC void call_next() const { ptr_cast<Class>(this)->next(); }
- template <size_t N, KFR_ENABLE_IF(N == width)>
- KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
- {
- const vec<T, N> result = value;
- call_next();
- return result;
- }
+ KFR_MEM_INTRINSIC vec<T, width> call_get_value() const { return ptr_cast<Class>(this)->get_value(); }
- template <size_t N, KFR_ENABLE_IF(N < width)>
- KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
+ template <typename U = T> // deferred so that Twork != T (e.g. xgensin) still instantiates the class
+ KFR_MEM_INTRINSIC std::enable_if_t<std::is_same_v<U, Twork>, vec<T, width>> get_value() const
{
- const vec<T, N> result = narrow<N>(value);
- call_shift(csize_t<N>());
- return result;
+ return value;
}
+};
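// Editorial note on generate<N>() above: requests narrower than `width`
// consume a prefix of the cached vector and shift fresh lanes in via
// slice<N, width>; wider requests recurse into two halves. E.g. with
// width == 8, generate<4>() returns lanes 0..3 and keeps 4..7 for the
// next call.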
- template <size_t N, KFR_ENABLE_IF(N > width)>
- KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N> x) const
- {
- const auto lo = generate(low(x));
- const auto hi = generate(high(x));
- return concat(lo, hi);
- }
+template <typename T, size_t VecWidth, typename Class>
+struct expression_traits<xgenerator<T, VecWidth, Class>> : public expression_traits_defaults
+{
+ using value_type = T;
+ constexpr static size_t dims = 1;
+ constexpr static shape<1> shapeof(const xgenerator<T, VecWidth, Class>&) { return shape<1>(infinite_size); }
+ constexpr static shape<1> shapeof() { return shape<1>(infinite_size); }
- mutable vec<T, width> value;
+ constexpr static inline bool explicit_operand = true;
+ constexpr static inline bool random_access = false;
};
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_linear : generator<T, width, generator_linear<T, width>>
+inline namespace CMT_ARCH_NAME
+{
+template <typename T, size_t VecWidth, typename Class, size_t N>
+KFR_INTRINSIC vec<T, N> get_elements(const xgenerator<T, VecWidth, Class>& self, const shape<1>& index,
+ const axis_params<0, N>&)
+{
+ return self.template generate<N>();
+}
+} // namespace CMT_ARCH_NAME
+
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenlinear : public xgenerator<T, VecWidth, xgenlinear<T, VecWidth>>
{
- generator_linear(T start, T step) CMT_NOEXCEPT : step(step), vstep(step* width) { this->resync(start); }
+ xgenlinear(T start, T step) CMT_NOEXCEPT : vstep{ step * VecWidth } { sync(start); }
KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
{
- this->value = start + enumerate<T, width>() * step;
+ this->value = start + enumerate(vec_shape<T, VecWidth>{}, vstep / VecWidth);
}
KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += vstep; }
-protected:
- T step;
T vstep;
};
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_exp : generator<T, width, generator_exp<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenexp : public xgenerator<T, VecWidth, xgenexp<T, VecWidth>>
{
- generator_exp(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp(make_vector(step* width))[0] - 1)
+ xgenexp(T start, T step) CMT_NOEXCEPT : step{ step },
+ vstep{ exp(make_vector(step * VecWidth)).front() - 1 }
{
this->resync(start);
}
KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
{
- this->value = exp(start + enumerate<T, width>() * step);
+ this->value = exp(start + enumerate<T, VecWidth>() * step);
}
KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
@@ -136,14 +137,14 @@ protected:
T vstep;
};
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2),
- std::enable_if_t<std::is_same<complex<deep_subtype<T>>, T>::value, int> = 0>
-struct generator_expj : generator<T, width, generator_expj<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<deep_subtype<T>> / 8 / 2>
+struct xgenexpj : public xgenerator<T, VecWidth, xgenexpj<T, VecWidth>>
{
using ST = deep_subtype<T>;
+ static_assert(std::is_same_v<complex<deep_subtype<T>>, T>, "xgenexpj requires complex type");
- generator_expj(ST start_, ST step_)
- : step(step_), alpha(2 * sqr(sin(width * step / 2))), beta(-sin(width * step))
+ xgenexpj(ST start_, ST step_)
+ : step(step_), alpha(2 * sqr(sin(VecWidth * step / 2))), beta(-sin(VecWidth * step))
{
this->resync(T(start_));
}
@@ -156,26 +157,26 @@ struct generator_expj : generator<T, width, generator_expj<T, width>>
}
protected:
- ST const step;
- ST const alpha;
- ST const beta;
- CMT_NOINLINE static vec<T, width> init_cossin(ST w, ST phase)
+ ST step;
+ ST alpha;
+ ST beta;
+ CMT_NOINLINE static vec<T, VecWidth> init_cossin(ST w, ST phase)
{
- return ccomp(cossin(dup(phase + enumerate<ST, width>() * w)));
+ return ccomp(cossin(dup(phase + enumerate<ST, VecWidth>() * w)));
}
};
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgenexp2 : public xgenerator<T, VecWidth, xgenexp2<T, VecWidth>>
{
- generator_exp2(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp2(make_vector(step* width))[0] - 1)
+ xgenexp2(T start, T step) CMT_NOEXCEPT : step{ step }, vstep{ exp2(make_vector(step * VecWidth))[0] - 1 }
{
this->resync(start);
}
KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
{
- this->value = exp2(start + enumerate<T, width>() * step);
+ this->value = exp2(start + enumerate(vec_shape<T, VecWidth>{}, step));
}
KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
@@ -185,11 +186,12 @@ protected:
T vstep;
};
-template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
-struct generator_cossin : generator<T, width, generator_cossin<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8>
+struct xgencossin : public xgenerator<T, VecWidth, xgencossin<T, VecWidth>>
{
- generator_cossin(T start, T step)
- : step(step), alpha(2 * sqr(sin(width / 2 * step / 2))), beta(-sin(width / 2 * step))
+ static_assert(VecWidth % 2 == 0);
+ xgencossin(T start, T step)
+ : step(step), alpha(2 * sqr(sin(VecWidth / 2 * step / 2))), beta(-sin(VecWidth / 2 * step))
{
this->resync(start);
}
@@ -204,56 +206,50 @@ protected:
T step;
T alpha;
T beta;
- CMT_NOINLINE static vec<T, width> init_cossin(T w, T phase)
+ CMT_NOINLINE static vec<T, VecWidth> init_cossin(T w, T phase)
{
- return cossin(dup(phase + enumerate<T, width / 2>() * w));
+ return cossin(dup(phase + enumerate(vec_shape<T, VecWidth / 2>{}, w)));
}
};
-template <typename T, size_t width = vector_width<T>* bitness_const(2, 4)>
-struct generator_sin : generator<T, width, generator_sin<T, width>>
+template <typename T, size_t VecWidth = vector_capacity<T> / 8 / 2>
+struct xgensin : public xgenerator<T, VecWidth, xgensin<T, VecWidth>, vec<T, 2>>
{
- generator_sin(T start, T step)
- : step(step), alpha(2 * sqr(sin(width * step / 2))), beta(sin(width * step))
+ xgensin(T start, T step)
+ : step(step), alpha(2 * sqr(sin(VecWidth * step / 2))), beta(sin(VecWidth * step))
{
this->resync(start);
}
KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
{
- const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step)));
- this->cos_value = low(cs);
- this->value = high(cs);
+ const vec<T, 2 * VecWidth> cs = cossin(dup(start + enumerate(vec_shape<T, VecWidth>{}, step)));
+ this->value = vec<vec<T, 2>, VecWidth>::from_flatten(cs);
}
KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT
{
- const vec<T, width> c = this->cos_value;
- const vec<T, width> s = this->value;
+ vec<T, 2 * VecWidth> cs = flatten(this->value); // non-const: reassigned below
- const vec<T, width> cc = alpha * c + beta * s;
- const vec<T, width> ss = alpha * s - beta * c;
+ cs = cs - addsub(alpha * cs, beta * swap<2>(cs));
- this->cos_value = c - cc;
- this->value = s - ss;
- }
+ this->value = vec<vec<T, 2>, VecWidth>::from_flatten(cs);
- template <size_t N>
- void shift(csize_t<N>) const CMT_NOEXCEPT
- {
- const vec<T, width> oldvalue = this->value;
- const vec<T, width> oldcosvalue = this->cos_value;
- next();
- this->value = slice<N, width>(oldvalue, this->value);
- this->cos_value = slice<N, width>(oldcosvalue, this->cos_value);
+ // const vec<T, VecWidth> c = even(flatten(this->value));
+ // const vec<T, VecWidth> s = odd(flatten(this->value));
+ // const vec<T, VecWidth> cc = alpha * c + beta * s;
+ // const vec<T, VecWidth> ss = alpha * s - beta * c;
+ // this->cos_value = c - cc;
+ // this->value = s - ss;
}
protected:
T step;
T alpha;
T beta;
- mutable vec<T, width> cos_value;
};
-} // namespace internal
+
+inline namespace CMT_ARCH_NAME
+{
/**
* @brief Returns template expression that generates values starting from the start and using the step as the
@@ -264,9 +260,9 @@ protected:
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
+KFR_FUNCTION xgenlinear<TF> gen_linear(T1 start, T2 step)
{
- return internal::generator_linear<TF>(start, step);
+ return xgenlinear<TF>(start, step);
}
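// Editorial usage sketch (not part of this commit): materializing the first
// values of the (infinite) generator. Assumes truncate() and univector
// behave as in released KFR.
//   univector<float, 8> v = truncate(gen_linear(0.f, 0.5f), 8);
//   // v == { 0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5 }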
/**
@@ -276,9 +272,9 @@ KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
+KFR_FUNCTION xgenexp<TF> gen_exp(T1 start, T2 step)
{
- return internal::generator_exp<TF>(start, step);
+ return xgenexp<TF>(start, step);
}
/**
@@ -288,9 +284,9 @@ KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = complex<ftype<common_type<T1, T2>>>>
-KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
+KFR_FUNCTION xgenexpj<TF> gen_expj(T1 start, T2 step)
{
- return internal::generator_expj<TF>(start, step);
+ return xgenexpj<TF>(start, step);
}
/**
@@ -300,9 +296,9 @@ KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
+KFR_FUNCTION xgenexp2<TF> gen_exp2(T1 start, T2 step)
{
- return internal::generator_exp2<TF>(start, step);
+ return xgenexp2<TF>(start, step);
}
/**
@@ -316,9 +312,9 @@ KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
+KFR_FUNCTION xgencossin<TF> gen_cossin(T1 start, T2 step)
{
- return internal::generator_cossin<TF>(start, step);
+ return xgencossin<TF>(start, step);
}
/**
@@ -328,9 +324,9 @@ KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_FUNCTION internal::generator_sin<TF> gen_sin(T1 start, T2 step)
+KFR_FUNCTION xgensin<TF> gen_sin(T1 start, T2 step)
{
- return internal::generator_sin<TF>(start, step);
+ return xgensin<TF>(start, step);
}
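// Editorial usage sketch for the trigonometric generators; c_pi is the KFR
// constant, values are approximate.
//   univector<double, 4> s = truncate(gen_sin(0.0, c_pi<double> / 2), 4);
//   // s ~= { 0, 1, 0, -1 }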
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp
@@ -38,6 +38,15 @@ struct expression_pointer;
template <typename T>
constexpr size_t maximum_expression_width = vector_width_for<T, cpu_t::highest> * 2;
+template <typename T, template <T...> typename Tpl, typename Pack>
+struct expand_cvals;
+
+template <typename T, template <T...> typename Tpl, T... vals>
+struct expand_cvals<T, Tpl, cvals_t<T, vals...>>
+{
+ using type = Tpl<vals...>;
+};
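// Editorial sketch of expand_cvals with a hypothetical target template:
// it unpacks a cvals_t value pack into any non-type template.
//   template <size_t... Ns> struct sizes_holder; // hypothetical
//   using unpacked = typename expand_cvals<size_t, sizes_holder, cvals_t<size_t, 1, 2, 4>>::type;
//   // unpacked == sizes_holder<1, 2, 4>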
+
inline namespace CMT_ARCH_NAME
{
@@ -50,64 +59,57 @@ KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& n
}
} // namespace CMT_ARCH_NAME
-template <typename T, size_t N = maximum_expression_width<T>>
-struct expression_vtable : expression_vtable<T, N / 2>
+template <typename T, index_t Dims>
+struct expression_vtable
{
- using func_get = void (*)(void*, size_t, vec<T, N>&);
- func_get get;
+ constexpr static const size_t Nsizes = 1 + ilog2(maximum_expression_width<T>);
- template <typename Expression>
- expression_vtable(ctype_t<Expression> t)
- : expression_vtable<T, N / 2>(t), get(&expression_vtable<T, N>::template static_get<Expression>)
- {
- }
+ using func_get = void (*)(void*, shape<Dims>, T*);
+ using func_set = void (*)(void*, shape<Dims>, const T*);
+ using func_shapeof = void (*)(void*, shape<Dims>&);
+ using func_substitute = bool (*)(void*, expression_pointer<T>&&);
- template <typename Expression>
- static void static_get(void* instance, size_t index, vec<T, N>& result)
- {
- result = get_elements(*static_cast<Expression*>(instance), cinput, index, vec_shape<T, N>());
- }
-};
-
-template <typename T>
-struct expression_vtable<T, 0>
-{
- using func_size = size_t (*)(void* p);
- using func_begin_block = void (*)(void*, size_t);
- using func_end_block = void (*)(void*, size_t);
- using func_substitute = bool (*)(void*, expression_pointer<T>&&);
-
- func_size size;
- func_begin_block begin_block;
- func_end_block end_block;
- func_substitute substitute;
+ func_shapeof fn_shapeof;
+ func_substitute fn_substitute;
+ std::array<std::array<func_get, Nsizes>, Dims> fn_get_elements;
+ std::array<std::array<func_set, Nsizes>, Dims> fn_set_elements;
template <typename Expression>
- expression_vtable(ctype_t<Expression>)
- : size(&expression_vtable<T, 0>::template static_size<Expression>),
- begin_block(&expression_vtable<T, 0>::template static_begin_block<Expression>),
- end_block(&expression_vtable<T, 0>::template static_end_block<Expression>),
- substitute(&expression_vtable<T, 0>::template static_substitute<Expression>)
+ expression_vtable(ctype_t<Expression> t)
{
+ fn_shapeof = &static_shapeof<Expression>;
+ fn_substitute = &static_substitute<Expression>;
+ cforeach(csizeseq<Nsizes>,
+ [&](auto size) CMT_INLINE_LAMBDA // auto binds csize_t, so '1 << size' stays a constant expression
+ {
+ cforeach(csizeseq<Dims>,
+ [&](auto axis) CMT_INLINE_LAMBDA
+ {
+ fn_get_elements[axis][size] =
+ &expression_vtable::static_get_elements<Expression, 1 << size, axis>;
+ fn_set_elements[axis][size] =
+ &expression_vtable::static_set_elements<Expression, 1 << size, axis>;
+ });
+ });
}
- template <typename Expression>
- static size_t static_size(void* instance)
+ template <typename Expression, size_t N, index_t VecAxis>
+ static void static_get_elements(void* instance, shape<Dims> index, T* dest)
{
- return static_cast<Expression*>(instance)->size();
+ write(dest, get_elements(*static_cast<Expression*>(instance), index, axis_params_v<VecAxis, N>));
}
- template <typename Expression>
- static void static_begin_block(void* instance, size_t size)
+ template <typename Expression, size_t N, index_t VecAxis>
+ static void static_set_elements(void* instance, shape<Dims> index, const T* src)
{
- return static_cast<Expression*>(instance)->begin_block(cinput, size);
+ set_elements(*static_cast<Expression*>(instance), index, axis_params_v<VecAxis, N>, read<N>(src));
}
template <typename Expression>
- static void static_end_block(void* instance, size_t size)
+ static void static_shapeof(void* instance, shape<Dims>& result)
{
- return static_cast<Expression*>(instance)->end_block(cinput, size);
+ result = expression_traits<Expression>::shapeof(*static_cast<Expression*>(instance));
}
template <typename Expression>
- static bool static_substitute(void* instance, expression_pointer<T>&& ptr)
+ static bool static_substitute(void* instance, expression_pointer<T>&& ptr) // must match func_substitute
{
return internal::invoke_substitute(*static_cast<Expression*>(instance), std::move(ptr));
}
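// Editorial note on the dispatch tables above: fn_get_elements[axis][ilog2(N)]
// holds &static_get_elements<Expression, N, axis> for each power-of-two
// N <= maximum_expression_width<T>, so e.g. a vec<T, 8> read on axis 0
// goes through fn_get_elements[0][3].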
@@ -143,51 +145,77 @@ KFR_INTRINSIC std::shared_ptr<expression_resource> make_resource(E&& e)
new (aligned_allocate<T>()) T(std::move(e)), [](T* pi) { aligned_deallocate<T>(pi); }));
}
-template <typename T, bool enable_resource>
-struct expression_pointer : input_expression
+template <typename T, index_t Dims>
+struct xpointer
{
- using value_type = T;
+ void* instance;
+ const expression_vtable<T, Dims>* vtable;
+ std::shared_ptr<expression_resource> resource;
- expression_pointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
- expression_pointer(const void* instance, const expression_vtable<T>* vtable,
- std::shared_ptr<expression_resource> resource = nullptr)
+ xpointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
+ xpointer(const void* instance, const expression_vtable<T, Dims>* vtable,
+ std::shared_ptr<expression_resource> resource = nullptr)
: instance(const_cast<void*>(instance)), vtable(vtable), resource(std::move(resource))
{
}
- template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width<T>)>
- friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t, size_t index,
- vec_shape<T, N>)
+
+ explicit operator bool() const { return instance != nullptr; }
+};
+
+template <typename T, index_t Dims>
+struct expression_traits<xpointer<T, Dims>> : expression_traits_defaults
+{
+ using value_type = T;
+ constexpr static size_t dims = Dims;
+ constexpr static shape<dims> shapeof(const xpointer<T, Dims>& self)
{
- static_assert(is_poweroftwo(N), "N must be a power of two");
- vec<T, N> result;
- static_cast<const expression_vtable<T, N>*>(self.vtable)->get(self.instance, index, result);
- return result;
+ shape<dims> result;
+ self.vtable->fn_shapeof(self.instance, result);
+ return result;
}
- template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width<T>)>
- friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t cinput, size_t index,
- vec_shape<T, N>)
+ constexpr static shape<dims> shapeof() { return shape<dims>(undefined_size); }
+
+ constexpr static inline bool random_access = false;
+};
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T, index_t NDims, index_t Axis, size_t N>
+KFR_INTRINSIC vec<T, N> get_elements(const xpointer<T, NDims>& self, const shape<NDims>& index,
+ const axis_params<Axis, N>& sh)
+{
+ static_assert(is_poweroftwo(N) && N >= 1);
+ if constexpr (N > maximum_expression_width<T>)
{
- static_assert(is_poweroftwo(N), "N must be a power of two");
- const vec<T, N / 2> r1 = get_elements(self, cinput, index, vec_shape<T, N / 2>());
- const vec<T, N / 2> r2 = get_elements(self, cinput, index + N / 2, vec_shape<T, N / 2>());
- return concat(r1, r2);
+ constexpr size_t Nhalf = N / 2;
+ return concat(get_elements(self, index, axis_params_v<Axis, Nhalf>),
+ get_elements(self, index.add_at(Axis, Nhalf), axis_params_v<Axis, Nhalf>));
}
- KFR_MEM_INTRINSIC void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); }
- KFR_MEM_INTRINSIC void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); }
- KFR_MEM_INTRINSIC size_t size() const { return vtable->size(instance); }
-
- KFR_MEM_INTRINSIC bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const
+ else
{
- return vtable->substitute(instance, std::move(new_pointer));
+ portable_vec<T, N> result;
+ self.vtable->fn_get_elements[Axis][ilog2(N)](self.instance, index, result.elem);
+ return result;
}
+}
- explicit operator bool() const { return instance != nullptr; }
-
-private:
- void* instance;
- const expression_vtable<T>* vtable;
- std::shared_ptr<expression_resource> resource;
-};
+template <typename T, index_t NDims, index_t Axis, size_t N>
+KFR_INTRINSIC void set_elements(const xpointer<T, NDims>& self, const shape<NDims>& index,
+ const axis_params<Axis, N>& sh, const identity<vec<T, N>>& value)
+{
+ static_assert(is_poweroftwo(N) && N >= 1);
+ if constexpr (N > maximum_expression_width<T>)
+ {
+ constexpr size_t Nhalf = N / 2;
+ set_elements(self, index, axis_params_v<Axis, Nhalf>, slice<0, Nhalf>(value));
+ set_elements(self, index.add_at(Axis, Nhalf), axis_params_v<Axis, Nhalf>, slice<Nhalf, Nhalf>(value));
+ }
+ else
+ {
+ self.vtable->fn_set_elements[Axis][ilog2(N)](self.instance, index, &value.front());
+ }
+}
+} // namespace CMT_ARCH_NAME
inline namespace CMT_ARCH_NAME
{
@@ -195,11 +223,10 @@ inline namespace CMT_ARCH_NAME
namespace internal
{
-template <typename T, typename E>
-KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
+template <typename T, index_t Dims, typename E>
+KFR_INTRINSIC expression_vtable<T, Dims>* make_expression_vtable()
{
- static_assert(is_input_expression<E>, "E must be an expression");
- static expression_vtable<T> vtable{ ctype_t<decay<E>>{} };
+ static expression_vtable<T, Dims> vtable{ ctype_t<decay<E>>{} };
return &vtable;
}
} // namespace internal
@@ -208,26 +235,25 @@ KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
* This overload takes reference to the expression.
* @warning Use with caution with local variables.
*/
-template <typename E, typename T = value_type_of<E>>
-KFR_INTRINSIC expression_pointer<T> to_pointer(E& expr)
+template <typename E, typename T = expression_value_type<E>, index_t Dims = expression_dims<E>>
+KFR_INTRINSIC expression_pointer<T, Dims> to_pointer(E& expr)
{
- static_assert(is_input_expression<E>, "E must be an expression");
- return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, E>());
+ return expression_pointer<T, Dims>(std::addressof(expr), internal::make_expression_vtable<T, Dims, E>());
}
/** @brief Converts the given expression into an opaque object.
* This overload takes ownership of the expression (Move semantics).
* @note Use std::move to force use of this overload.
*/
-template <typename E, typename T = value_type_of<E>>
-KFR_INTRINSIC expression_pointer<T> to_pointer(E&& expr)
+template <typename E, typename T = expression_value_type<E>, index_t Dims = expression_dims<E>>
+KFR_INTRINSIC expression_pointer<T, Dims> to_pointer(E&& expr)
{
- static_assert(is_input_expression<E>, "E must be an expression");
std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr));
void* instance = ptr->instance();
- return expression_pointer<T>(instance, internal::make_expression_vtable<T, E>(), std::move(ptr));
+ return expression_pointer<T, Dims>(instance, internal::make_expression_vtable<T, Dims, E>(), std::move(ptr));
}
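// Editorial usage sketch: type-erasing an expression behind the opaque
// pointer. Assumes expression_pointer<T, Dims> names the erased type at
// this revision.
//   auto gen = gen_linear(0.f, 1.f);
//   expression_pointer<float, 1> ptr = to_pointer(std::move(gen));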
+#if 0
template <typename T, size_t key>
class expression_placeholder : public input_expression
{
@@ -306,5 +332,7 @@ KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& n
}
} // namespace internal
+#endif
} // namespace CMT_ARCH_NAME
-} // namespace kfr
+
+} // namespace kfr
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -25,92 +25,14 @@
*/
#pragma once
-#include "../simd/impl/function.hpp"
-#include "../simd/operators.hpp"
-#include "../simd/shuffle.hpp"
-#include "../simd/vec.hpp"
-#include "expression.hpp"
-#include <functional>
-
-#ifdef CMT_ARCH_ARM
-#define KFR_DISABLE_READCYCLECOUNTER
-#endif
+#include "random_bits.hpp"
namespace kfr
{
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-struct seed_from_rdtsc_t
-{
-};
-
-constexpr seed_from_rdtsc_t seed_from_rdtsc{};
-#endif
-
inline namespace CMT_ARCH_NAME
{
-using random_state = u32x4;
-
-#ifndef KFR_DISABLE_READCYCLECOUNTER
-#ifdef CMT_COMPILER_CLANG
-#define KFR_builtin_readcyclecounter() \
- static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
-#else
-#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
-#endif
-#endif
-
-struct random_bit_generator
-{
-#ifndef KFR_DISABLE_READCYCLECOUNTER
- KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
- : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
- (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
- {
- (void)operator()();
- }
-#endif
- KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
- : state(x0, x1, x2, x3)
- {
- (void)operator()();
- }
- KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
- : state(bitcast<u32>(make_vector(x0, x1)))
- {
- (void)operator()();
- }
-
- KFR_MEM_INTRINSIC random_state operator()()
- {
- const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
- const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
- state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
- return state;
- }
-
-protected:
- random_state state;
-};
-
-static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
-
-template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
-KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
-{
- return narrow<N>(bitcast<u8>(gen()));
-}
-
-template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
-KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
-{
- constexpr size_t N2 = prev_poweroftwo(N - 1);
- const vec<u8, N2> bits1 = random_bits<N2>(gen);
- const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
- return concat(bits1, bits2);
-}
-
template <typename T, size_t N, KFR_ENABLE_IF(is_integral<T>)>
KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen)
{
diff --git a/include/kfr/base/random_bits.hpp b/include/kfr/base/random_bits.hpp
@@ -0,0 +1,114 @@
+/** @addtogroup random
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../simd/impl/function.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/shuffle.hpp"
+#include "../simd/vec.hpp"
+#include "expression.hpp"
+#include <functional>
+
+#ifdef CMT_ARCH_ARM
+#define KFR_DISABLE_READCYCLECOUNTER
+#endif
+
+namespace kfr
+{
+
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+struct seed_from_rdtsc_t
+{
+};
+
+constexpr seed_from_rdtsc_t seed_from_rdtsc{};
+#endif
+
+inline namespace CMT_ARCH_NAME
+{
+
+using random_state = u32x4;
+
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+#ifdef CMT_COMPILER_CLANG
+#define KFR_builtin_readcyclecounter() \
+ static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
+#else
+#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
+#endif
+#endif
+
+struct random_bit_generator
+{
+#ifndef KFR_DISABLE_READCYCLECOUNTER
+ KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
+ : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
+ (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
+ {
+ (void)operator()();
+ }
+#endif
+ KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
+ : state(x0, x1, x2, x3)
+ {
+ (void)operator()();
+ }
+ KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
+ : state(bitcast<u32>(make_vector(x0, x1)))
+ {
+ (void)operator()();
+ }
+
+ KFR_MEM_INTRINSIC random_state operator()()
+ {
+ const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
+ const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
+ state = bitcast<u32>(rotateright<3>(bitcast<u8>(fmadd(state, mul, add))));
+ return state;
+ }
+
+protected:
+ random_state state;
+};
+
+static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
+
+template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
+{
+ return narrow<N>(bitcast<u8>(gen()));
+}
+
+template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
+{
+ constexpr size_t N2 = prev_poweroftwo(N - 1);
+ const vec<u8, N2> bits1 = random_bits<N2>(gen);
+ const vec<u8, N - N2> bits2 = random_bits<N - N2>(gen);
+ return concat(bits1, bits2);
+}
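+// Editorial usage sketch: drawing 32 random bytes from a deterministic seed
+// using the generator defined above. N == 32 exceeds sizeof(random_state),
+// so the recursive overload splits the request into two 16-byte draws.
+//   random_bit_generator gen(0x12345678u, 0x9abcdef0u, 0xdeadbeefu, 0xcafebabeu);
+//   const vec<u8, 32> bytes = random_bits<32>(gen);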
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp
@@ -25,7 +25,7 @@
*/
#pragma once
-#include "../math/min_max.hpp"
+#include "../simd/min_max.hpp"
#include "../simd/horizontal.hpp"
#include "../simd/impl/function.hpp"
#include "../simd/operators.hpp"
diff --git a/include/kfr/base/shape.hpp b/include/kfr/base/shape.hpp
@@ -27,7 +27,7 @@
#include "impl/static_array.hpp"
-#include "../math/min_max.hpp"
+#include "../simd/min_max.hpp"
#include "../simd/shuffle.hpp"
#include "../simd/types.hpp"
#include "../simd/vec.hpp"
@@ -60,6 +60,8 @@ constexpr inline cindex_t<val> cindex{};
constexpr inline index_t infinite_size = max_index_t;
+constexpr inline index_t undefined_size = 0;
+
constexpr inline index_t maximum_dims = 8;
using dimset = vec<i8, maximum_dims>;
@@ -142,20 +144,36 @@ struct shape : static_array_base<index_t, csizeseq_t<dims>>
return false;
}
- shape add_inf(const shape& other) const
+ friend shape add_shape(const shape& lhs, const shape& rhs)
{
- vec<index_t, dims> x = **this;
- vec<index_t, dims> y = *other;
- mask<index_t, dims> inf = (x == infinite_size) || (y == infinite_size);
+ vec<index_t, dims> x = *lhs;
+ vec<index_t, dims> y = *rhs;
+ mask<index_t, dims> inf = max(x, y) == infinite_size;
return select(inf, infinite_size, x + y);
}
- shape sub_inf(const shape& other) const
+ friend shape sub_shape(const shape& lhs, const shape& rhs)
{
- vec<index_t, dims> x = **this;
- vec<index_t, dims> y = *other;
- mask<index_t, dims> inf = (x == infinite_size) || (y == infinite_size);
+ vec<index_t, dims> x = *lhs;
+ vec<index_t, dims> y = *rhs;
+ mask<index_t, dims> inf = max(x, y) == infinite_size;
return select(inf, infinite_size, x - y);
}
+ friend shape add_shape_undef(const shape& lhs, const shape& rhs)
+ {
+ vec<index_t, dims> x = *lhs;
+ vec<index_t, dims> y = *rhs;
+ mask<index_t, dims> inf = max(x, y) == infinite_size;
+ mask<index_t, dims> undef = min(x, y) == undefined_size;
+ return select(inf, infinite_size, select(undef, undefined_size, x + y));
+ }
+ friend shape sub_shape_undef(const shape& lhs, const shape& rhs)
+ {
+ vec<index_t, dims> x = *lhs;
+ vec<index_t, dims> y = *rhs;
+ mask<index_t, dims> inf = max(x, y) == infinite_size;
+ mask<index_t, dims> undef = min(x, y) == undefined_size;
+ return select(inf, infinite_size, select(undef, undefined_size, x - y));
+ }
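+ // Editorial semantics note for the helpers above: infinite_size absorbs
+ // both operations; undefined_size (0) is absorbed only by the *_undef forms.
+ //   add_shape({4}, {infinite_size})        -> {infinite_size}
+ //   add_shape_undef({4}, {undefined_size}) -> {undefined_size}
+ //   add_shape({4}, {4})                    -> {8}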
friend shape min(const shape& x, const shape& y) { return kfr::min(*x, *y); }
diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp
@@ -1,103 +0,0 @@
-/** @addtogroup utility
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../math/min_max.hpp"
-#include "../simd/shuffle.hpp"
-#include "../simd/vec.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-/**
- * @brief Sort the elements in the vector in ascending order
- * @param x input vector
- * @return sorted vector
- * @code
- * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(-10, 1, 2, 1000));
- * @endcode
- */
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x)
-{
- constexpr size_t Nhalf = N / 2;
- vec<T, Nhalf> e = low(x);
- vec<T, Nhalf> o = high(x);
- constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
- for (size_t i = 0; i < Nhalf; i++)
- {
- vec<T, Nhalf> t;
- t = min(e, o);
- o = max(e, o);
- o = rotateright<1>(o);
- e = t;
- t = max(e, o);
- o = min(e, o);
- e = t;
- t = blend(e, o, blend0);
- o = blend(o, e, blend0);
- o = rotateleft<1>(o);
- e = t;
- }
- return interleavehalves(concat(e, o));
-}
-
-/**
- * @brief Sort the elements in the vector in descending order
- * @param x input vector
- * @return sorted vector
- * @code
- * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(1000, 2, 1, -10));
- * @endcode
- */
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x)
-{
- constexpr size_t Nhalf = N / 2;
- vec<T, Nhalf> e = low(x);
- vec<T, Nhalf> o = high(x);
- constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
- for (size_t i = 0; i < Nhalf; i++)
- {
- vec<T, Nhalf> t;
- t = max(e, o);
- o = min(e, o);
- o = rotateright<1>(o);
- e = t;
- t = min(e, o);
- o = max(e, o);
- e = t;
- t = blend(e, o, blend0);
- o = blend(o, e, blend0);
- o = rotateleft<1>(o);
- e = t;
- }
- return interleavehalves(concat(e, o));
-}
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/base/tensor.hpp b/include/kfr/base/tensor.hpp
@@ -29,8 +29,8 @@
#include "../cometa/array.hpp"
-#include "../math/logical.hpp"
-#include "../math/min_max.hpp"
+#include "../simd/logical.hpp"
+#include "../simd/min_max.hpp"
#include "../simd/horizontal.hpp"
#include "../simd/impl/function.hpp"
#include "../simd/read_write.hpp"
@@ -914,7 +914,7 @@ struct expression_traits<tensor<T, Dims>> : expression_traits_defaults
{
return self.shape();
}
- KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ 0 }; }
+ KFR_MEM_INTRINSIC constexpr static shape<dims> shapeof() { return shape<dims>{ undefined_size }; }
};
inline namespace CMT_ARCH_NAME
diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp
@@ -24,22 +24,15 @@
#include "simd.hpp"
-#include "math/abs.hpp"
#include "math/asin_acos.hpp"
#include "math/atan.hpp"
-#include "math/clamp.hpp"
#include "math/compiletime.hpp"
#include "math/complex_math.hpp"
#include "math/gamma.hpp"
#include "math/hyperbolic.hpp"
#include "math/interpolation.hpp"
#include "math/log_exp.hpp"
-#include "math/logical.hpp"
-#include "math/min_max.hpp"
#include "math/modzerobessel.hpp"
-#include "math/round.hpp"
-#include "math/saturation.hpp"
-#include "math/select.hpp"
#include "math/sin_cos.hpp"
#include "math/sqrt.hpp"
#include "math/tan.hpp"
diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp
@@ -26,12 +26,12 @@
#pragma once
#include "../simd/complex.hpp"
-#include "abs.hpp"
+#include "../simd/abs.hpp"
#include "atan.hpp"
#include "hyperbolic.hpp"
#include "log_exp.hpp"
-#include "min_max.hpp"
-#include "select.hpp"
+#include "../simd/min_max.hpp"
+#include "../simd/select.hpp"
#include "sin_cos.hpp"
#include "sqrt.hpp"
diff --git a/include/kfr/math/impl/abs.hpp b/include/kfr/math/impl/abs.hpp
@@ -1,138 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/select.hpp"
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
- return x & special_constants<T>::invhighbitmask();
-}
-
-KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT
-{
- const __m128i sh = _mm_srai_epi32(x.v, 31);
- const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
- return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk);
-}
-KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); }
-KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); }
-KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); }
-KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; }
-
-#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT
-{
- const __m256i sh = _mm256_srai_epi32(x.v, 31);
- const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
- return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk);
-}
-KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); }
-KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); }
-KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); }
-KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); }
-KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); }
-KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); }
-KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); }
-KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; }
-#endif
-
-KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); }
-KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); }
-KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); }
-#else
-KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); }
-#endif
-
-KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; }
-KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; }
-
-KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); }
-#else
-KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT
-{
- return x & special_constants<f64>::invhighbitmask();
-}
-#endif
-
-KFR_HANDLE_ALL_SIZES_1(abs)
-
-#else
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
- return x & special_constants<T>::invhighbitmask();
-}
-
-// fallback
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
-{
- return select(x >= T(0), x, -x);
-}
-#endif
-KFR_HANDLE_SCALAR(abs)
-} // namespace intrinsics
-
-KFR_I_FN(abs)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/math/impl/asin_acos.hpp b/include/kfr/math/impl/asin_acos.hpp
@@ -22,10 +22,10 @@
*/
#pragma once
-#include "../../math/atan.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sqrt.hpp"
#include "../../simd/impl/function.hpp"
+#include "../../simd/select.hpp"
+#include "../atan.hpp"
+#include "../sqrt.hpp"
namespace kfr
{
diff --git a/include/kfr/math/impl/atan.hpp b/include/kfr/math/impl/atan.hpp
@@ -21,9 +21,9 @@
See https://www.kfrlib.com for details.
*/
#pragma once
-#include "../../math/abs.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sin_cos.hpp"
+#include "../../simd/abs.hpp"
+#include "../../simd/select.hpp"
+#include "../sin_cos.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"
diff --git a/include/kfr/math/impl/clamp.hpp b/include/kfr/math/impl/clamp.hpp
@@ -1,55 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/min_max.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-template <typename T>
-KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi)
-{
- return max(min(x, hi), lo);
-}
-
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi)
-{
- return max(min(x, hi), lo);
-}
-
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi)
-{
- return max(min(x, hi), zerovector<T, N>());
-}
-} // namespace intrinsics
-KFR_I_FN(clamp)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
diff --git a/include/kfr/math/impl/hyperbolic.hpp b/include/kfr/math/impl/hyperbolic.hpp
@@ -22,10 +22,10 @@
*/
#pragma once
-#include "../../math/abs.hpp"
-#include "../../math/log_exp.hpp"
-#include "../../math/min_max.hpp"
-#include "../../math/select.hpp"
+#include "../../simd/abs.hpp"
+#include "../log_exp.hpp"
+#include "../../simd/min_max.hpp"
+#include "../../simd/select.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"
diff --git a/include/kfr/math/impl/log_exp.hpp b/include/kfr/math/impl/log_exp.hpp
@@ -22,11 +22,11 @@
*/
#pragma once
-#include "../../math/abs.hpp"
-#include "../../math/clamp.hpp"
-#include "../../math/min_max.hpp"
-#include "../../math/round.hpp"
-#include "../../math/select.hpp"
+#include "../../simd/abs.hpp"
+#include "../../simd/clamp.hpp"
+#include "../../simd/min_max.hpp"
+#include "../../simd/round.hpp"
+#include "../../simd/select.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"
diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp
@@ -1,284 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/abs.hpp"
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-#if defined CMT_ARCH_SSE41
-
-// horizontal OR
-KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); }
-
-// horizontal AND
-KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
-#endif
-
-#if defined CMT_ARCH_AVX
-// horizontal OR
-KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); }
-
-KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
-
-KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
-
-// horizontal AND
-KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
-
-KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
-
-KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
-
-#if defined CMT_ARCH_AVX512
-// horizontal OR
-KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
-KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
-KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); }
-KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); }
-
-// horizontal AND
-KFR_INTRINSIC bool bittestall(const mf32avx512& x)
-{
- return !uint16_t(~_mm512_movepi32_mask(_mm512_castps_si512(x.v)));
-}
-KFR_INTRINSIC bool bittestall(const mf64avx512& x)
-{
- return !uint8_t(~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)));
-}
-KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
-KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
-KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
-
-#endif
-
-#elif defined CMT_ARCH_SSE41
-KFR_INTRINSIC bool bittestany(const mf32sse& x)
-{
- return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
-}
-KFR_INTRINSIC bool bittestany(const mf64sse& x)
-{
- return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
-}
-KFR_INTRINSIC bool bittestall(const mf32sse& x)
-{
- return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
-}
-KFR_INTRINSIC bool bittestall(const mf64sse& x)
-{
- return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
-}
-#endif
-
-#if !defined CMT_ARCH_SSE41
-
-KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); }
-KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); }
-KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); }
-KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); }
-
-KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); }
-KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); }
-KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); }
-KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); }
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
-{
- return bittestall(expand_simd(a, bit<T>(true)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
-{
- return bittestall(low(a)) && bittestall(high(a));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
-{
- return bittestany(expand_simd(a, bit<T>(false)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
-{
- return bittestany(low(a)) || bittestany(high(a));
-}
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC bool bittestall(const mu32neon& a)
-{
- const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v));
- return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
-}
-
-KFR_INTRINSIC bool bittestany(const mu32neon& a)
-{
- const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v));
- return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
-}
-KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
-
-KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
-{
- return bittestall(expand_simd(a, bit<T>(true)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
-{
- return bittestall(low(a)) && bittestall(high(a));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
-KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
-{
- return bittestany(expand_simd(a, bit<T>(false)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
-KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
-{
- return bittestany(low(a)) || bittestany(high(a));
-}
-
-#else
-
-template <typename T, size_t N>
-KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x)
-{
- typename bitmask<N>::type val = 0;
- for (size_t i = 0; i < N; i++)
- {
- val |= static_cast<typename bitmask<N>::type>(x[i]) << i;
- }
- return val;
-}
-
-template <typename T, size_t N>
-KFR_INTRINSIC bool bittestany(const mask<T, N>& x)
-{
- return getmask(x).value;
-}
-template <typename T, size_t N>
-KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y)
-{
- return bittestany(x & y);
-}
-
-template <typename T, size_t N>
-KFR_INTRINSIC bool bittestall(const mask<T, N>& x)
-{
- return !getmask(~x).value;
-}
-template <typename T, size_t N>
-KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y)
-{
- return !bittestany(~x & y);
-}
-#endif
-} // namespace intrinsics
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
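Note: the two reductions this deleted header provides answer "is any lane set" (bittestany, a horizontal OR) and "are all lanes set" (bittestall, a horizontal AND). Each backend picks the cheapest test (_mm_testz/_mm_testc, movemask, NEON pairwise min/max), and vectors narrower than the native width are padded with the reduction's identity — true for AND, false for OR — while wider ones are split into low/high halves. A portable model of the semantics only, independent of any backend (the _model names are hypothetical):

    #include <cstddef>

    // bittestany == horizontal OR, bittestall == horizontal AND over mask lanes.
    template <size_t N>
    bool bittestany_model(const bool (&m)[N])
    {
        for (size_t i = 0; i < N; ++i)
            if (m[i])
                return true;
        return false;
    }

    template <size_t N>
    bool bittestall_model(const bool (&m)[N])
    {
        for (size_t i = 0; i < N; ++i)
            if (!m[i])
                return false;
        return true;
    }

    int main()
    {
        const bool m[4] = { false, true, false, false };
        return bittestany_model(m) && !bittestall_model(m) ? 0 : 1;
    }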
diff --git a/include/kfr/math/impl/min_max.hpp b/include/kfr/math/impl/min_max.hpp
@@ -1,236 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/abs.hpp"
-#include "../../math/select.hpp"
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); }
-KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); }
-KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); }
-KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); }
-
-KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); }
-KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); }
-KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); }
-KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); }
-
-#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); }
-KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); }
-KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); }
-KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); }
-KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); }
-
-KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); }
-KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); }
-KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); }
-KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); }
-KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); }
-
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); }
-KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); }
-KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); }
-KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); }
-
-KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); }
-KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); }
-KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); }
-KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); }
-KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); }
-KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); }
-KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); }
-KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); }
-KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); }
-KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); }
-KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); }
-KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); }
-KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); }
-KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); }
-
-KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); }
-KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); }
-KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); }
-KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); }
-
-KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); }
-KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); }
-KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); }
-KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); }
-#else
-KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
-KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
-KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
-KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
-KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
-KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
-KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
-KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
-#endif
-
-#if defined CMT_ARCH_AVX
-KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); }
-KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); }
-KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); }
-KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); }
-#endif
-
-#if defined CMT_ARCH_SSE41
-KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); }
-KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); }
-KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); }
-KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); }
-
-KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); }
-KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); }
-KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); }
-KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); }
-#else
-KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
-KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
-KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
-KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); }
-
-KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); }
-KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); }
-KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); }
-KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); }
-
-#endif
-
-KFR_HANDLE_ALL_SIZES_2(min)
-KFR_HANDLE_ALL_SIZES_2(max)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); }
-KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); }
-KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); }
-KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); }
-KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); }
-KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); }
-KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); }
-KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); }
-
-KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); }
-KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); }
-KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); }
-KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); }
-KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); }
-KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); }
-KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); }
-KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); }
-
-KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); }
-KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); }
-#if defined CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); }
-KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); }
-#else
-KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); }
-KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_2(min)
-KFR_HANDLE_ALL_SIZES_2(max)
-
-#else
-
-// fallback
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y)
-{
- return select(x < y, x, y);
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y)
-{
- return select(x > y, x, y);
-}
-#endif
-
-template <typename T>
-KFR_INTRINSIC T min(initialvalue<T>)
-{
- return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::max();
-}
-template <typename T>
-KFR_INTRINSIC T max(initialvalue<T>)
-{
- return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::min();
-}
-template <typename T>
-KFR_INTRINSIC T absmin(initialvalue<T>)
-{
- return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::max();
-}
-template <typename T>
-KFR_INTRINSIC T absmax(initialvalue<T>)
-{
- return 0;
-}
-
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y)
-{
- return min(abs(x), abs(y));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y)
-{
- return max(abs(x), abs(y));
-}
-
-KFR_HANDLE_SCALAR(min)
-KFR_HANDLE_SCALAR(max)
-KFR_HANDLE_SCALAR(absmin)
-KFR_HANDLE_SCALAR(absmax)
-} // namespace intrinsics
-KFR_I_FN(min)
-KFR_I_FN(max)
-KFR_I_FN(absmin)
-KFR_I_FN(absmax)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
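Note: two details of the moved min/max implementation are worth calling out. Where no dedicated instruction exists (e.g. 64-bit integer min/max before AVX-512), the kernels fall back to select(x < y, x, y), a compare plus blend. And min(initialvalue<T>)/max(initialvalue<T>) supply the identity element for reductions: a running minimum is seeded with +infinity (or the largest finite value for integer types) so the first real element always replaces the seed. A scalar sketch of that seeding pattern (min_identity and reduce_min are hypothetical names):

    #include <algorithm>
    #include <limits>
    #include <vector>

    // Identity element for a running-min reduction, mirroring
    // min(initialvalue<T>): +inf for floating point, T's max otherwise.
    template <typename T>
    constexpr T min_identity()
    {
        return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
                                                    : std::numeric_limits<T>::max();
    }

    template <typename T>
    T reduce_min(const std::vector<T>& data)
    {
        T acc = min_identity<T>(); // empty input yields the identity
        for (const T& v : data)
            acc = std::min(acc, v);
        return acc;
    }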
diff --git a/include/kfr/math/impl/round.hpp b/include/kfr/math/impl/round.hpp
@@ -1,282 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-#include "abs.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_ss(V) \
- _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_sd(V) \
- _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V))
-#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V))
-#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V))
-#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V))
-
-#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); }
-KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); }
-KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); }
-KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); }
-KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); }
-KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); }
-KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); }
-KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); }
-KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); }
-KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); }
-
-#if defined CMT_ARCH_AVX
-
-KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); }
-KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); }
-KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); }
-KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); }
-KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); }
-KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); }
-KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); }
-KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); }
-KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); }
-KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); }
-
-#endif
-
-#if defined CMT_ARCH_AVX512
-
-KFR_INTRINSIC f32avx512 floor(const f32avx512& value)
-{
- return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f32avx512 ceil(const f32avx512& value)
-{
- return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f32avx512 trunc(const f32avx512& value)
-{
- return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f32avx512 round(const f32avx512& value)
-{
- return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f64avx512 floor(const f64avx512& value)
-{
- return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f64avx512 ceil(const f64avx512& value)
-{
- return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f64avx512 trunc(const f64avx512& value)
-{
- return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f64avx512 round(const f64avx512& value)
-{
- return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-}
-KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); }
-KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>)
-KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>)
-KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>)
-KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>)
-KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>)
-
-#else
-
-// fallback
-
-template <typename T>
-constexpr inline T fp_precision_limit = 4503599627370496.0;
-template <>
-constexpr inline f32 fp_precision_limit<f32> = 16777216.0f;
-
-template <size_t N>
-KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x)
-{
- vec<f32, N> t = innercast<f32>(innercast<i32>(x));
- return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x)
-{
- vec<f64, N> t = innercast<f64>(innercast<i64>(x));
- return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x)
-{
- vec<f32, N> t = innercast<f32>(innercast<i32>(x));
- return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x)
-{
- vec<f64, N> t = innercast<f64>(innercast<i64>(x));
- return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x)
-{
- return select(abs(x) >= fp_precision_limit<f32>, x,
- innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x))));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x)
-{
- return select(abs(x) >= fp_precision_limit<f64>, x,
- innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x))));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x)
-{
- return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x)));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x)
-{
- return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x)));
-}
-template <size_t N>
-KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x)
-{
- return x - floor(x);
-}
-template <size_t N>
-KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x)
-{
- return x - floor(x);
-}
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
-KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&)
-{
- return T(0);
-}
-
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value)
-{
- return innercast<IT>(floor(value));
-}
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value)
-{
- return innercast<IT>(ceil(value));
-}
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value)
-{
- return innercast<IT>(trunc(value));
-}
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value)
-{
- return innercast<IT>(round(value));
-}
-
-KFR_HANDLE_SCALAR(floor)
-KFR_HANDLE_SCALAR(ceil)
-KFR_HANDLE_SCALAR(round)
-KFR_HANDLE_SCALAR(trunc)
-KFR_HANDLE_SCALAR(fract)
-KFR_HANDLE_SCALAR(ifloor)
-KFR_HANDLE_SCALAR(iceil)
-KFR_HANDLE_SCALAR(iround)
-KFR_HANDLE_SCALAR(itrunc)
-} // namespace intrinsics
-KFR_I_FN(floor)
-KFR_I_FN(ceil)
-KFR_I_FN(round)
-KFR_I_FN(trunc)
-KFR_I_FN(fract)
-KFR_I_FN(ifloor)
-KFR_I_FN(iceil)
-KFR_I_FN(iround)
-KFR_I_FN(itrunc)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
-
-#undef KFR_mm_trunc_ps
-#undef KFR_mm_roundnearest_ps
-#undef KFR_mm_trunc_pd
-#undef KFR_mm_roundnearest_pd
-#undef KFR_mm_trunc_ss
-#undef KFR_mm_roundnearest_ss
-#undef KFR_mm_trunc_sd
-#undef KFR_mm_roundnearest_sd
-#undef KFR_mm_floor_ss
-#undef KFR_mm_floor_sd
-#undef KFR_mm_ceil_ss
-#undef KFR_mm_ceil_sd
-#undef KFR_mm256_trunc_ps
-#undef KFR_mm256_roundnearest_ps
-#undef KFR_mm256_trunc_pd
-#undef KFR_mm256_roundnearest_pd
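Note: the non-SSE4.1 fallback above rests on one observation: any float with magnitude at least 2^24 (2^52 for double — the fp_precision_limit values) has no fractional bits, so rounding is the identity and the value passes through untouched; anything smaller fits in i32/i64 and can be truncated through an integer cast, then corrected. A scalar sketch of the floor case (floor_model is a hypothetical name):

    #include <cstdint>

    // Scalar model of the non-SSE4.1 floor fallback. |x| >= 2^24 has no
    // fractional bits in f32, so it is returned as-is (this branch also
    // passes NaN through); smaller values are truncated via int cast, then
    // decremented where truncation rounded toward zero on negative inputs.
    inline float floor_model(float x)
    {
        constexpr float limit = 16777216.0f; // 2^24 == fp_precision_limit<f32>
        if (!(x < limit && x > -limit))
            return x;
        const float t = static_cast<float>(static_cast<int32_t>(x));
        return t - (x < t ? 1.0f : 0.0f);
    }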
diff --git a/include/kfr/math/impl/saturation.hpp b/include/kfr/math/impl/saturation.hpp
@@ -1,205 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../math/select.hpp"
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-// Generic functions
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
-{
- using UT = utype<T>;
- constexpr size_t shift = typebits<UT>::bits - 1;
- vec<UT, N> aa = bitcast<UT>(a);
- vec<UT, N> bb = bitcast<UT>(b);
- const vec<UT, N> sum = aa + bb;
- aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
-
- return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), bitcast<T>(aa), bitcast<T>(sum));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
-{
- using UT = utype<T>;
- constexpr size_t shift = typebits<UT>::bits - 1;
- vec<UT, N> aa = bitcast<UT>(a);
- vec<UT, N> bb = bitcast<UT>(b);
- const vec<UT, N> diff = aa - bb;
- aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
-
- return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), bitcast<T>(aa), bitcast<T>(diff));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
-{
- const vec<T, N> t = allonesvector(a);
- return select(a > t - b, t, a + b);
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return select(a < b, zerovector(a), a - b);
-}
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); }
-KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); }
-KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); }
-KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); }
-
-KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); }
-KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); }
-KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); }
-KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); }
-
-KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
-KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
-KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
-KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
-
-KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
-KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
-KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
-KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
-
-#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); }
-KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); }
-KFR_INTRINSIC i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); }
-
-KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); }
-KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); }
-KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); }
-
-KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); }
-KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); }
-KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); }
-KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); }
-
-KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); }
-KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); }
-KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); }
-KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); }
-KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); }
-KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); }
-KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); }
-KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); }
-KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); }
-KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); }
-
-KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); }
-KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); }
-KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b)
-{
- return saturated_unsigned_add(a, b);
-}
-KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b)
-{
- return saturated_unsigned_add(a, b);
-}
-KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); }
-KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); }
-KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b)
-{
- return saturated_unsigned_sub(a, b);
-}
-KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b)
-{
- return saturated_unsigned_sub(a, b);
-}
-#endif
-
-KFR_HANDLE_ALL_SIZES_2(satadd)
-KFR_HANDLE_ALL_SIZES_2(satsub)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); }
-KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); }
-KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); }
-KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); }
-KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); }
-KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); }
-KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); }
-KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); }
-
-KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); }
-KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); }
-KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); }
-KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); }
-KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); }
-KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); }
-KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); }
-KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); }
-
-KFR_HANDLE_ALL_SIZES_2(satadd)
-KFR_HANDLE_ALL_SIZES_2(satsub)
-
-#else
-// fallback
-template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)>
-KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_signed_add(a, b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)>
-KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_unsigned_add(a, b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)>
-KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_signed_sub(a, b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)>
-KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_unsigned_sub(a, b);
-}
-#endif
-KFR_HANDLE_SCALAR(satadd)
-KFR_HANDLE_SCALAR(satsub)
-} // namespace intrinsics
-KFR_I_FN(satadd)
-KFR_I_FN(satsub)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
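Note: the generic saturated_signed_add above packs the branch-free trick into three steps: compute the wrapped sum in unsigned arithmetic, precompute the saturation value (T's max if a is non-negative, T's min otherwise — always the correct answer on overflow, since signed overflow happens in a's direction), and detect overflow from sign bits alone: it occurred exactly when a and b share a sign and the sum does not. A scalar int32_t model of the same logic (satadd_model is a hypothetical name):

    #include <cstdint>

    inline int32_t satadd_model(int32_t a, int32_t b)
    {
        const uint32_t ua  = static_cast<uint32_t>(a);
        const uint32_t ub  = static_cast<uint32_t>(b);
        const uint32_t sum = ua + ub; // wraps; well-defined for unsigned
        // INT32_MAX if a >= 0, INT32_MIN if a < 0: the result on overflow.
        const uint32_t sat = (ua >> 31) + static_cast<uint32_t>(INT32_MAX);
        // Sign bit is 1 when no overflow happened: either the operand
        // signs differ, or the sum kept b's sign.
        const bool no_overflow = (((ua ^ ub) | ~(ub ^ sum)) >> 31) != 0;
        return static_cast<int32_t>(no_overflow ? sum : sat);
    }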
diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp
@@ -1,331 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 2 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../../simd/impl/function.hpp"
-#include "../../simd/operators.hpp"
-
-namespace kfr
-{
-inline namespace CMT_ARCH_NAME
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
-{
- return _mm_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
-{
- return _mm_blendv_ps(y.v, x.v, m.v);
-}
-KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
-{
- return _mm_blendv_pd(y.v, x.v, m.v);
-}
-
-#if defined CMT_ARCH_AVX
-KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
-{
- return _mm256_blendv_pd(y.v, x.v, m.v);
-}
-KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
-{
- return _mm256_blendv_ps(y.v, x.v, m.v);
-}
-#endif
-
-#if defined CMT_ARCH_AVX2
-KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
-{
- return _mm256_blendv_epi8(y.v, x.v, m.v);
-}
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y)
-{
- return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
-}
-KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y)
-{
- return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
-}
-KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y)
-{
- return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y)
-{
- return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y)
-{
- return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y)
-{
- return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y)
-{
- return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y)
-{
- return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y)
-{
- return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
-}
-KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y)
-{
- return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
-}
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- constexpr size_t Nout = next_simd_width<T>(N);
- return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
- .shuffle(csizeseq<N>);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
- // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
-{
- constexpr size_t Nout = next_simd_width<T>(N);
- return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
-{
- return concat2(select(a.h.low, b, c), select(a.h.high, b, c));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
-{
- constexpr size_t Nout = next_simd_width<T>(N);
- return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
-{
- return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
-{
- constexpr size_t Nout = next_simd_width<T>(N);
- return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
- .shuffle(csizeseq<N>);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
-{
- return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
-}
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
-{
- return vbslq_f32(m.v, x.v, y.v);
-}
-KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y)
-{
- return vbslq_s8(m.v, x.v, y.v);
-}
-KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y)
-{
- return vbslq_u8(m.v, x.v, y.v);
-}
-KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
-{
- return vbslq_s16(m.v, x.v, y.v);
-}
-KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
-{
- return vbslq_u16(m.v, x.v, y.v);
-}
-KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
-{
- return vbslq_s32(m.v, x.v, y.v);
-}
-KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
-{
- return vbslq_u32(m.v, x.v, y.v);
-}
-KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
-{
- return vbslq_s64(m.v, x.v, y.v);
-}
-KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
-{
- return vbslq_u64(m.v, x.v, y.v);
-}
-
-#ifdef CMT_ARCH_NEON64
-KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
-{
- return vbslq_f64(m.v, x.v, y.v);
-}
-#else
-KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
-{
- return y ^ ((x ^ y) & m.asvec());
-}
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- constexpr size_t Nout = next_simd_width<T>(N);
- return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
- .shuffle(csizeseq<N>);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
-{
- return select(m, vec<T, N>(x), vec<T, N>(y));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
-{
- return select(m, x, vec<T, N>(y));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
-{
- return select(m, vec<T, N>(x), y);
-}
-
-#else
-
-// fallback
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y)
-{
- return y ^ ((x ^ y) & m.asvec());
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
-{
- return select(m, vec<T, N>(x), vec<T, N>(y));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
-{
- return select(m, x, vec<T, N>(y));
-}
-template <typename T, size_t N>
-KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
-{
- return select(m, vec<T, N>(x), y);
-}
-#endif
-template <typename T1, typename T2>
-KFR_INTRINSIC common_type<T1, T2> select(bool m, const T1& x, const T2& y)
-{
- return m ? x : y;
-}
-
-} // namespace intrinsics
-KFR_I_FN(select)
-} // namespace CMT_ARCH_NAME
-} // namespace kfr
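The generic fallback deleted above (and re-added under simd/impl/select.hpp later in this diff) is the classic branchless blend: every mask lane is all-ones or all-zero, so y ^ ((x ^ y) & m) yields x where the mask is set and y elsewhere. A minimal scalar sketch of the identity, with uint32_t standing in for one vector lane (illustration only, not part of the commit):

    #include <cassert>
    #include <cstdint>

    // mask must be 0xFFFFFFFF or 0 per lane, as produced by comparisons.
    static uint32_t select_lane(uint32_t mask, uint32_t x, uint32_t y)
    {
        // mask all-ones: y ^ (x ^ y) == x; mask zero: y ^ 0 == y.
        return y ^ ((x ^ y) & mask);
    }

    int main()
    {
        assert(select_lane(0xFFFFFFFFu, 1u, 2u) == 1u);
        assert(select_lane(0x00000000u, 1u, 2u) == 2u);
    }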
diff --git a/include/kfr/math/impl/sin_cos.hpp b/include/kfr/math/impl/sin_cos.hpp
@@ -22,10 +22,10 @@
*/
#pragma once
-#include "../../math/abs.hpp"
-#include "../../math/min_max.hpp"
-#include "../../math/round.hpp"
-#include "../../math/select.hpp"
+#include "../../simd/abs.hpp"
+#include "../../simd/min_max.hpp"
+#include "../../simd/round.hpp"
+#include "../../simd/select.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"
diff --git a/include/kfr/math/impl/tan.hpp b/include/kfr/math/impl/tan.hpp
@@ -22,12 +22,12 @@
*/
#pragma once
-#include "../../math/abs.hpp"
-#include "../../math/select.hpp"
-#include "../../math/sin_cos.hpp"
+#include "../../simd/abs.hpp"
#include "../../simd/constants.hpp"
#include "../../simd/impl/function.hpp"
#include "../../simd/operators.hpp"
+#include "../../simd/select.hpp"
+#include "../sin_cos.hpp"
namespace kfr
{
diff --git a/include/kfr/math/interpolation.hpp b/include/kfr/math/interpolation.hpp
@@ -25,7 +25,7 @@
*/
#pragma once
-#include "select.hpp"
+#include "../simd/select.hpp"
#include "sin_cos.hpp"
namespace kfr
diff --git a/include/kfr/simd.hpp b/include/kfr/simd.hpp
@@ -22,15 +22,23 @@
*/
#pragma once
+#include "simd/abs.hpp"
+#include "simd/clamp.hpp"
#include "simd/comparison.hpp"
#include "simd/complex.hpp"
#include "simd/constants.hpp"
#include "simd/digitreverse.hpp"
#include "simd/horizontal.hpp"
+#include "simd/logical.hpp"
#include "simd/mask.hpp"
+#include "simd/min_max.hpp"
#include "simd/operators.hpp"
#include "simd/platform.hpp"
#include "simd/read_write.hpp"
+#include "simd/round.hpp"
+#include "simd/saturation.hpp"
+#include "simd/select.hpp"
#include "simd/shuffle.hpp"
+#include "simd/sort.hpp"
#include "simd/types.hpp"
#include "simd/vec.hpp"
diff --git a/include/kfr/math/abs.hpp b/include/kfr/simd/abs.hpp
diff --git a/include/kfr/math/clamp.hpp b/include/kfr/simd/clamp.hpp
diff --git a/include/kfr/simd/impl/abs.hpp b/include/kfr/simd/impl/abs.hpp
@@ -0,0 +1,138 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../select.hpp"
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
+
+// floating point
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
+{
+ return x & special_constants<T>::invhighbitmask();
+}
+
+KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT
+{
+ const __m128i sh = _mm_srai_epi32(x.v, 31);
+ const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
+ return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk);
+}
+KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); }
+KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); }
+KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); }
+KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT
+{
+ const __m256i sh = _mm256_srai_epi32(x.v, 31);
+ const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
+ return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk);
+}
+KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); }
+KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); }
+KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); }
+KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; }
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); }
+KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); }
+KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); }
+KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); }
+KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; }
+#endif
+
+KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>)
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); }
+KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); }
+KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); }
+#if defined CMT_ARCH_NEON64
+KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); }
+#else
+KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); }
+#endif
+
+KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; }
+
+KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); }
+#if defined CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); }
+#else
+KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT
+{
+ return x & special_constants<f64>::invhighbitmask();
+}
+#endif
+
+KFR_HANDLE_ALL_SIZES_1(abs)
+
+#else
+
+// floating point
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
+{
+ return x & special_constants<T>::invhighbitmask();
+}
+
+// fallback
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
+{
+ return select(x >= T(0), x, -x);
+}
+#endif
+KFR_HANDLE_SCALAR(abs)
+} // namespace intrinsics
+
+KFR_I_FN(abs)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
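The SSE2 path above builds abs for i64 without a native instruction: _mm_srai_epi32 plus the (3, 3, 1, 1) shuffle replicate each lane's sign into a full-width mask m, and (x ^ m) - m maps x to itself when m == 0 and to ~x + 1 == -x when m is all-ones. A scalar sketch of the same identity (illustration only):

    #include <cassert>
    #include <cstdint>

    static int64_t abs_branchless(int64_t x)
    {
        const int64_t m = x >> 63; // arithmetic shift: 0 or -1 (all-ones)
        return (x ^ m) - m;        // x unchanged, or ~x + 1 == -x
    }

    int main()
    {
        assert(abs_branchless(-5) == 5);
        assert(abs_branchless(7) == 7);
    }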
diff --git a/include/kfr/simd/impl/clamp.hpp b/include/kfr/simd/impl/clamp.hpp
@@ -0,0 +1,55 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../min_max.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T>
+KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi)
+{
+ return max(min(x, hi), lo);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi)
+{
+ return max(min(x, hi), lo);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi)
+{
+ return max(min(x, hi), zerovector<T, N>());
+}
+} // namespace intrinsics
+KFR_I_FN(clamp)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
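clamp is defined purely in terms of min/max, so it inherits their NaN behaviour, and the two-argument form clamps to [0, hi]. A scalar sketch of the identities, with std::min/std::max standing in for the vector overloads (illustration only):

    #include <algorithm>
    #include <cassert>

    static int clamp3(int x, int lo, int hi) { return std::max(std::min(x, hi), lo); }

    int main()
    {
        assert(clamp3(15, 0, 10) == 10);
        assert(clamp3(-3, 0, 10) == 0); // clamp(x, hi) behaves as lo == 0
        assert(clamp3(4, 0, 10) == 4);
    }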
diff --git a/include/kfr/simd/impl/logical.hpp b/include/kfr/simd/impl/logical.hpp
@@ -0,0 +1,284 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../abs.hpp"
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+#if defined CMT_ARCH_SSE41
+
+// horizontal OR
+KFR_INTRINSIC bool bittestany(const mu8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+
+// horizontal AND
+KFR_INTRINSIC bool bittestall(const mu8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+#endif
+
+#if defined CMT_ARCH_AVX
+// horizontal OR
+KFR_INTRINSIC bool bittestany(const mf32sse& x) { return !_mm_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf64sse& x) { return !_mm_testz_pd(x.v, x.v); }
+
+KFR_INTRINSIC bool bittestany(const mf32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mf64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
+
+KFR_INTRINSIC bool bittestany(const mu8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+
+// horizontal AND
+KFR_INTRINSIC bool bittestall(const mf32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
+
+KFR_INTRINSIC bool bittestall(const mf32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mf64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
+
+KFR_INTRINSIC bool bittestall(const mu8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mu64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const mi64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+
+#if defined CMT_ARCH_AVX512
+// horizontal OR
+KFR_INTRINSIC bool bittestany(const mf32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mf64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const mu8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64avx512& x) { return _mm512_movepi64_mask(x.v); }
+
+// horizontal AND
+KFR_INTRINSIC bool bittestall(const mf32avx512& x)
+{
+ return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v));
+}
+KFR_INTRINSIC bool bittestall(const mf64avx512& x)
+{
+ return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v));
+}
+KFR_INTRINSIC bool bittestall(const mu8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mu32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mu64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const mi32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const mi64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+
+#endif
+
+#elif defined CMT_ARCH_SSE41
+KFR_INTRINSIC bool bittestany(const mf32sse& x)
+{
+ return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
+}
+KFR_INTRINSIC bool bittestany(const mf64sse& x)
+{
+ return !_mm_testz_si128(bitcast<bit<u8>>(x).v, bitcast<bit<u8>>(x).v);
+}
+KFR_INTRINSIC bool bittestall(const mf32sse& x)
+{
+ return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
+}
+KFR_INTRINSIC bool bittestall(const mf64sse& x)
+{
+ return _mm_testc_si128(bitcast<bit<u8>>(x).v, allonesvector(bitcast<bit<u8>>(x)).v);
+}
+#endif
+
+#if !defined CMT_ARCH_SSE41
+
+KFR_INTRINSIC bool bittestany(const mf32sse& x) { return _mm_movemask_ps(x.v); }
+KFR_INTRINSIC bool bittestany(const mf64sse& x) { return _mm_movemask_pd(x.v); }
+KFR_INTRINSIC bool bittestany(const mu8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mu64sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const mi64sse& x) { return _mm_movemask_epi8(x.v); }
+
+KFR_INTRINSIC bool bittestall(const mf32sse& x) { return !_mm_movemask_ps((~x).v); }
+KFR_INTRINSIC bool bittestall(const mf64sse& x) { return !_mm_movemask_pd((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mu64sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const mi64sse& x) { return !_mm_movemask_epi8((~x).v); }
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
+{
+ return bittestall(expand_simd(a, bit<T>(true)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
+{
+ return bittestall(low(a)) && bittestall(high(a));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
+{
+ return bittestany(expand_simd(a, bit<T>(false)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
+{
+ return bittestany(low(a)) || bittestany(high(a));
+}
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC bool bittestall(const mu32neon& a)
+{
+ const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v));
+ return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
+}
+
+KFR_INTRINSIC bool bittestany(const mu32neon& a)
+{
+ const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v));
+ return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
+}
+KFR_INTRINSIC bool bittestany(const mu8neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mu16neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mu64neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mi8neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mi16neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mi64neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mf32neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestany(const mf64neon& a) { return bittestany(bitcast<bit<u32>>(a)); }
+
+KFR_INTRINSIC bool bittestall(const mu8neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mu16neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mu64neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mi8neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mi16neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mi64neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mf32neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+KFR_INTRINSIC bool bittestall(const mf64neon& a) { return bittestall(bitcast<bit<u32>>(a)); }
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
+{
+ return bittestall(expand_simd(a, bit<T>(true)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestall(const mask<T, N>& a)
+{
+ return bittestall(low(a)) && bittestall(high(a));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
+{
+ return bittestany(expand_simd(a, bit<T>(false)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestany(const mask<T, N>& a)
+{
+ return bittestany(low(a)) || bittestany(high(a));
+}
+
+#else
+
+template <typename T, size_t N>
+KFR_INTRINSIC bitmask<N> getmask(const mask<T, N>& x)
+{
+ typename bitmask<N>::type val = 0;
+ for (size_t i = 0; i < N; i++)
+ {
+ val |= static_cast<int>(x[i]) << i;
+ }
+ return val;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestany(const mask<T, N>& x)
+{
+ return getmask(x).value;
+}
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestany(const mask<T, N>& x, const mask<T, N>& y)
+{
+ return bittestany(x & y);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestall(const mask<T, N>& x)
+{
+ return !getmask(~x).value;
+}
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestall(const mask<T, N>& x, const mask<T, N>& y)
+{
+ return !bittestany(~x & y);
+}
+#endif
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
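Across all the instruction-set branches above the contract is the same: bittestany reduces a mask to "at least one lane set" and bittestall to "every lane set", with wide masks split via low/high and narrow ones widened via expand_simd (padding with true for the AND reduction and false for the OR reduction, so the padding never changes the answer). Reference semantics as a scalar sketch (illustration only):

    #include <array>
    #include <cassert>
    #include <cstddef>

    template <std::size_t N>
    static bool any_lane(const std::array<bool, N>& m)
    {
        for (bool b : m)
            if (b) return true;
        return false;
    }

    template <std::size_t N>
    static bool all_lanes(const std::array<bool, N>& m)
    {
        for (bool b : m)
            if (!b) return false;
        return true;
    }

    int main()
    {
        std::array<bool, 4> m{ false, true, false, false };
        assert(any_lane(m) && !all_lanes(m));
    }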
diff --git a/include/kfr/simd/impl/min_max.hpp b/include/kfr/simd/impl/min_max.hpp
@@ -0,0 +1,236 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../abs.hpp"
+#include "../select.hpp"
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); }
+KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); }
+KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); }
+
+KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); }
+KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); }
+KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); }
+
+KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); }
+
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); }
+
+KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); }
+KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); }
+
+KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); }
+KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); }
+
+KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); }
+KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); }
+#else
+KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
+KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
+#endif
+
+#if defined CMT_ARCH_AVX
+KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); }
+#endif
+
+#if defined CMT_ARCH_SSE41
+KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); }
+KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); }
+KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); }
+
+KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); }
+KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); }
+KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); }
+#else
+KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); }
+
+KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); }
+
+#endif
+
+KFR_HANDLE_ALL_SIZES_2(min)
+KFR_HANDLE_ALL_SIZES_2(max)
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); }
+KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); }
+KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); }
+KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); }
+KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); }
+KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); }
+KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); }
+
+KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); }
+KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); }
+KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); }
+KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); }
+KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); }
+KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); }
+KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); }
+
+KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); }
+KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); }
+#if defined CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); }
+KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); }
+#else
+KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); }
+KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); }
+#endif
+
+KFR_HANDLE_ALL_SIZES_2(min)
+KFR_HANDLE_ALL_SIZES_2(max)
+
+#else
+
+// fallback
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return select(x < y, x, y);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return select(x > y, x, y);
+}
+#endif
+
+template <typename T>
+KFR_INTRINSIC T min(initialvalue<T>)
+{
+ return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
+ : std::numeric_limits<T>::max();
+}
+template <typename T>
+KFR_INTRINSIC T max(initialvalue<T>)
+{
+ return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity()
+ : std::numeric_limits<T>::min();
+}
+template <typename T>
+KFR_INTRINSIC T absmin(initialvalue<T>)
+{
+ return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
+ : std::numeric_limits<T>::max();
+}
+template <typename T>
+KFR_INTRINSIC T absmax(initialvalue<T>)
+{
+ return 0;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return min(abs(x), abs(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return max(abs(x), abs(y));
+}
+
+KFR_HANDLE_SCALAR(min)
+KFR_HANDLE_SCALAR(max)
+KFR_HANDLE_SCALAR(absmin)
+KFR_HANDLE_SCALAR(absmax)
+} // namespace intrinsics
+KFR_I_FN(min)
+KFR_I_FN(max)
+KFR_I_FN(absmin)
+KFR_I_FN(absmax)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
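The initialvalue overloads supply reduction identities: min starts at +infinity (or the type's maximum), max at -infinity (or its minimum), absmin at +infinity and absmax at 0, so folding any element into the identity leaves that element. A scalar sketch of a fold seeded this way (hypothetical helper, not the library's reduce machinery):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <vector>

    static double absmax_reduce(const std::vector<double>& xs)
    {
        double acc = 0.0; // absmax identity, as in absmax(initialvalue<T>)
        for (double x : xs)
            acc = std::max(acc, std::fabs(x));
        return acc;
    }

    int main() { assert(absmax_reduce({ -3.0, 2.0 }) == 3.0); }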
diff --git a/include/kfr/simd/impl/round.hpp b/include/kfr/simd/impl/round.hpp
@@ -0,0 +1,282 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "function.hpp"
+#include "../operators.hpp"
+#include "abs.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+
+#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_ss(V) \
+ _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_sd(V) \
+ _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+
+#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V))
+#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V))
+#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V))
+#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V))
+
+#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); }
+KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); }
+KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); }
+KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); }
+KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); }
+KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); }
+KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); }
+KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); }
+KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); }
+KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); }
+
+#if defined CMT_ARCH_AVX
+
+KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); }
+KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); }
+KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); }
+KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); }
+KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); }
+KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); }
+KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); }
+KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); }
+KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); }
+KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); }
+
+#endif
+
+#if defined CMT_ARCH_AVX512
+
+KFR_INTRINSIC f32avx512 floor(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 ceil(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 trunc(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 round(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 floor(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 ceil(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 trunc(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 round(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); }
+KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); }
+#endif
+
+KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>)
+KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>)
+KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>)
+KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>)
+KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>)
+
+#else
+
+// fallback
+
+template <typename T>
+constexpr inline T fp_precision_limit = 4503599627370496.0;
+template <>
+constexpr inline f32 fp_precision_limit<f32> = 16777216.0f;
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x)
+{
+ vec<f32, N> t = innercast<f32>(innercast<i32>(x));
+ return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x)
+{
+ vec<f64, N> t = innercast<f64>(innercast<i64>(x));
+ return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x)
+{
+ vec<f32, N> t = innercast<f32>(innercast<i32>(x));
+ return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x)
+{
+ vec<f64, N> t = innercast<f64>(innercast<i64>(x));
+ return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f32>, x,
+ innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x))));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f64>, x,
+ innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x))));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x)));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x)));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x)
+{
+ return x - floor(x);
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x)
+{
+ return x - floor(x);
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>
+KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&)
+{
+ return T(0);
+}
+
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value)
+{
+ return innercast<IT>(floor(value));
+}
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value)
+{
+ return innercast<IT>(ceil(value));
+}
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value)
+{
+ return innercast<IT>(trunc(value));
+}
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value)
+{
+ return innercast<IT>(round(value));
+}
+
+KFR_HANDLE_SCALAR(floor)
+KFR_HANDLE_SCALAR(ceil)
+KFR_HANDLE_SCALAR(round)
+KFR_HANDLE_SCALAR(trunc)
+KFR_HANDLE_SCALAR(fract)
+KFR_HANDLE_SCALAR(ifloor)
+KFR_HANDLE_SCALAR(iceil)
+KFR_HANDLE_SCALAR(iround)
+KFR_HANDLE_SCALAR(itrunc)
+} // namespace intrinsics
+KFR_I_FN(floor)
+KFR_I_FN(ceil)
+KFR_I_FN(round)
+KFR_I_FN(trunc)
+KFR_I_FN(fract)
+KFR_I_FN(ifloor)
+KFR_I_FN(iceil)
+KFR_I_FN(iround)
+KFR_I_FN(itrunc)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+#undef KFR_mm_trunc_ps
+#undef KFR_mm_roundnearest_ps
+#undef KFR_mm_trunc_pd
+#undef KFR_mm_roundnearest_pd
+#undef KFR_mm_trunc_ss
+#undef KFR_mm_roundnearest_ss
+#undef KFR_mm_trunc_sd
+#undef KFR_mm_roundnearest_sd
+#undef KFR_mm_floor_ss
+#undef KFR_mm_floor_sd
+#undef KFR_mm_ceil_ss
+#undef KFR_mm_ceil_sd
+#undef KFR_mm256_trunc_ps
+#undef KFR_mm256_roundnearest_ps
+#undef KFR_mm256_trunc_pd
+#undef KFR_mm256_roundnearest_pd
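The fallback branch above rounds through integer casts, guarded by fp_precision_limit: from 2^52 upward for f64 (2^24 for f32) every representable value is already an integer, so passing such inputs through unchanged is exact and also avoids overflowing the intermediate integer. A scalar sketch of the floor path (illustration only):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    static double floor_fallback(double x)
    {
        const double limit = 4503599627370496.0; // 2^52, fp_precision_limit<f64>
        if (std::fabs(x) >= limit)
            return x; // already an integer
        const double t = static_cast<double>(static_cast<int64_t>(x)); // truncates toward zero
        return t - (x < t ? 1.0 : 0.0); // step down where truncation rounded up
    }

    int main()
    {
        assert(floor_fallback(-1.5) == -2.0);
        assert(floor_fallback(1.5) == 1.0);
    }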
diff --git a/include/kfr/simd/impl/saturation.hpp b/include/kfr/simd/impl/saturation.hpp
@@ -0,0 +1,205 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../select.hpp"
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Generic functions
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+ using UT = utype<T>;
+ constexpr size_t shift = typebits<UT>::bits - 1;
+ vec<UT, N> aa = bitcast<UT>(a);
+ vec<UT, N> bb = bitcast<UT>(b);
+ const vec<UT, N> sum = aa + bb;
+ aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+ return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), bitcast<T>(aa), bitcast<T>(sum));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+ using UT = utype<T>;
+ constexpr size_t shift = typebits<UT>::bits - 1;
+ vec<UT, N> aa = bitcast<UT>(a);
+ vec<UT, N> bb = bitcast<UT>(b);
+ const vec<UT, N> diff = aa - bb;
+ aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+ return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), a, bitcast<T>(diff));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+ const vec<T, N> t = allonesvector(a);
+ return select(a > t - b, t, a + b);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+ return select(a < b, zerovector(a), a - b);
+}
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); }
+
+KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
+KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
+
+KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
+KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); }
+
+KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); }
+KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); }
+
+KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); }
+KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); }
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); }
+KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b)
+{
+ return saturated_unsigned_add(a, b);
+}
+KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b)
+{
+ return saturated_unsigned_add(a, b);
+}
+KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b)
+{
+ return saturated_unsigned_sub(a, b);
+}
+KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b)
+{
+ return saturated_unsigned_sub(a, b);
+}
+#endif
+
+KFR_HANDLE_ALL_SIZES_2(satadd)
+KFR_HANDLE_ALL_SIZES_2(satsub)
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); }
+KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); }
+KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); }
+KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); }
+KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); }
+KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); }
+KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); }
+KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); }
+
+KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); }
+KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); }
+KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); }
+KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); }
+KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); }
+KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); }
+KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); }
+KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); }
+
+KFR_HANDLE_ALL_SIZES_2(satadd)
+KFR_HANDLE_ALL_SIZES_2(satsub)
+
+#else
+// Fallback: no native saturating SIMD; emulate with the generic helpers.
+template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)>
+KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
+{
+ return saturated_signed_add(a, b);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)>
+KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
+{
+ return saturated_unsigned_add(a, b);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_signed<T>)>
+KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
+{
+ return saturated_signed_sub(a, b);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_unsigned<T>)>
+KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
+{
+ return saturated_unsigned_sub(a, b);
+}
+#endif
+KFR_HANDLE_SCALAR(satadd)
+KFR_HANDLE_SCALAR(satsub)
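+
+// Usage sketch (u8 is KFR's 8-bit unsigned alias): satadd(u8(250), u8(10)) == 255 and
+// satsub(u8(5), u8(10)) == 0; results clamp to the type's range instead of wrapping.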
+} // namespace intrinsics
+KFR_I_FN(satadd)
+KFR_I_FN(satsub)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/impl/select.hpp b/include/kfr/simd/impl/select.hpp
@@ -0,0 +1,331 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "function.hpp"
+#include "../operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+
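+// _mm_blendv_* picks lanes from its second operand where the mask's most significant
+// bit is set, so the arguments are passed as (y, x, m) to return x where m is true.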
+KFR_INTRINSIC u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
+{
+ return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
+{
+ return _mm_blendv_ps(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
+{
+ return _mm_blendv_pd(y.v, x.v, m.v);
+}
+
+#if defined CMT_ARCH_AVX
+KFR_INTRINSIC f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
+{
+ return _mm256_blendv_pd(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
+{
+ return _mm256_blendv_ps(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
+{
+ return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX512
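+// AVX-512 blends take a k-mask register, so the vector mask is first compressed with
+// _mm512_movepi*_mask before calling _mm512_mask_blend_*.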
+KFR_INTRINSIC f64avx512 select(const mf64avx512& m, const f64avx512& x, const f64avx512& y)
+{
+ return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC f32avx512 select(const mf32avx512& m, const f32avx512& x, const f32avx512& y)
+{
+ return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC u8avx512 select(const mu8avx512& m, const u8avx512& x, const u8avx512& y)
+{
+ return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u16avx512 select(const mu16avx512& m, const u16avx512& x, const u16avx512& y)
+{
+ return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u32avx512 select(const mu32avx512& m, const u32avx512& x, const u32avx512& y)
+{
+ return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u64avx512 select(const mu64avx512& m, const u64avx512& x, const u64avx512& y)
+{
+ return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i8avx512 select(const mi8avx512& m, const i8avx512& x, const i8avx512& y)
+{
+ return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i16avx512 select(const mi16avx512& m, const i16avx512& x, const i16avx512& y)
+{
+ return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i32avx512 select(const mi32avx512& m, const i32avx512& x, const i32avx512& y)
+{
+ return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i64avx512 select(const mi64avx512& m, const i64avx512& x, const i64avx512& y)
+{
+ return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+#endif
+
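+// For sizes that are not native SIMD widths, widen to the next supported width,
+// blend there, and truncate the result back to N lanes.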
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+ constexpr size_t Nout = next_simd_width<T>(N);
+ return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+ .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+ return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
+{
+ constexpr size_t Nout = next_simd_width<T>(N);
+ return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c)
+{
+ return concat2(select(a.h.low, b, c), select(a.h.high, b, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
+{
+ constexpr size_t Nout = next_simd_width<T>(N);
+ return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c)
+{
+ return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
+{
+ constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c)
+{
+ return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
+}
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
+{
+ return vbslq_f32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y)
+{
+ return vbslq_s8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y)
+{
+ return vbslq_u8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
+{
+ return vbslq_s16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
+{
+ return vbslq_u16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
+{
+ return vbslq_s32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
+{
+ return vbslq_u32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
+{
+ return vbslq_s64(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
+{
+ return vbslq_u64(m.v, x.v, y.v);
+}
+
+#ifdef CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+ return vbslq_f64(m.v, x.v, y.v);
+}
+#else
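+// vbslq_f64 requires AArch64; fall back to the bitwise-select identity y ^ ((x ^ y) & m).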
+KFR_INTRINSIC f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+ return y ^ ((x ^ y) & m.asvec());
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+ constexpr size_t Nout = next_simd_width<T>(N);
+ return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+ .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+ return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
+{
+ return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
+{
+ return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
+{
+ return select(m, vec<T, N>(x), y);
+}
+
+#else
+
+// Fallback: portable bitwise select; y ^ ((x ^ y) & m) yields x where the mask
+// bits are set and y elsewhere.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+{
+ return y ^ ((x ^ y) & m.asvec());
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y)
+{
+ return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const vec<T, N>& x, const T& y)
+{
+ return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const vec<T, N>& y)
+{
+ return select(m, vec<T, N>(x), y);
+}
+#endif
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> select(bool m, const T1& x, const T2& y)
+{
+ return m ? x : y;
+}
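+
+// Usage sketch: select(m, x, y) returns x in lanes where m is true and y elsewhere;
+// e.g. an absolute-value expression can be written as select(x >= T(0), x, -x).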
+
+} // namespace intrinsics
+KFR_I_FN(select)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/logical.hpp b/include/kfr/simd/logical.hpp
diff --git a/include/kfr/math/min_max.hpp b/include/kfr/simd/min_max.hpp
diff --git a/include/kfr/math/round.hpp b/include/kfr/simd/round.hpp
diff --git a/include/kfr/math/saturation.hpp b/include/kfr/simd/saturation.hpp
diff --git a/include/kfr/math/select.hpp b/include/kfr/simd/select.hpp
diff --git a/include/kfr/simd/sort.hpp b/include/kfr/simd/sort.hpp
@@ -0,0 +1,103 @@
+/** @addtogroup utility
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "min_max.hpp"
+#include "shuffle.hpp"
+#include "vec.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Sort the elements in the vector in ascending order
+ * @param x input vector
+ * @return sorted vector
+ * @code
+ * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(-10, 1, 2, 1000));
+ * @endcode
+ */
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x)
+{
+ constexpr size_t Nhalf = N / 2;
+ vec<T, Nhalf> e = low(x);
+ vec<T, Nhalf> o = high(x);
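+    // In effect an odd-even transposition network: e and o hold the even/odd lanes,
+    // each iteration compare-exchanges them twice (rotating o to pair neighbouring
+    // lanes), and after Nhalf rounds the interleaved result is fully sorted.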
+ constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
+ for (size_t i = 0; i < Nhalf; i++)
+ {
+ vec<T, Nhalf> t;
+ t = min(e, o);
+ o = max(e, o);
+ o = rotateright<1>(o);
+ e = t;
+ t = max(e, o);
+ o = min(e, o);
+ e = t;
+ t = blend(e, o, blend0);
+ o = blend(o, e, blend0);
+ o = rotateleft<1>(o);
+ e = t;
+ }
+ return interleavehalves(concat(e, o));
+}
+
+/**
+ * @brief Sort the elements in the vector in descending order
+ * @param x input vector
+ * @return sorted vector
+ * @code
+ * CHECK(sort(make_vector(1000, 1, 2, -10)) == make_vector(1000, 2, 1, -10));
+ * @endcode
+ */
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x)
+{
+ constexpr size_t Nhalf = N / 2;
+ vec<T, Nhalf> e = low(x);
+ vec<T, Nhalf> o = high(x);
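+    // Same transposition network as sort(), with min/max swapped for descending order.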
+ constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
+ for (size_t i = 0; i < Nhalf; i++)
+ {
+ vec<T, Nhalf> t;
+ t = max(e, o);
+ o = min(e, o);
+ o = rotateright<1>(o);
+ e = t;
+ t = min(e, o);
+ o = max(e, o);
+ e = t;
+ t = blend(e, o, blend0);
+ o = blend(o, e, blend0);
+ o = rotateleft<1>(o);
+ e = t;
+ }
+ return interleavehalves(concat(e, o));
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/sources.cmake b/sources.cmake
@@ -27,13 +27,14 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/math_expressions.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/base/old_basic_expressions.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/base/random_bits.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/shape.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/simd_expressions.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/tensor.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/impl/static_array.hpp
@@ -91,65 +92,66 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_mp3.h
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h
- ${PROJECT_SOURCE_DIR}/include/kfr/math/abs.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/clamp.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/complex_math.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/gamma.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/hyperbolic.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/interpolation.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/logical.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/log_exp.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/min_max.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/modzerobessel.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/round.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/saturation.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/select.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/sin_cos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/sqrt.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/tan.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/abs.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/asin_acos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/atan.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/clamp.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/gamma.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/hyperbolic.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/logical.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/log_exp.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/min_max.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/modzerobessel.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/round.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/saturation.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/select.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sin_cos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sqrt.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math/impl/tan.hpp
${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid.hpp
${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid_auto.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/abs.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/clamp.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/comparison.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/complex.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/complex_type.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/constants.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/digitreverse.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/horizontal.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/logical.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/mask.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/min_max.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/operators.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/platform.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/read_write.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/round.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/saturation.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/select.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/shuffle.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/sort.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/types.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/vec.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/abs.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_clang.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_generic.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_clang.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_complex.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/clamp.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/logical.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/min_max.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/round.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/saturation.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/select.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specialconstants.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
@@ -187,18 +189,18 @@ set(
${PROJECT_SOURCE_DIR}/tests/unit/base/tensor.cpp
${PROJECT_SOURCE_DIR}/tests/unit/graphics/color.cpp
${PROJECT_SOURCE_DIR}/tests/unit/graphics/geometry.cpp
- ${PROJECT_SOURCE_DIR}/tests/unit/math/abs.cpp
${PROJECT_SOURCE_DIR}/tests/unit/math/asin_acos.cpp
${PROJECT_SOURCE_DIR}/tests/unit/math/atan.cpp
${PROJECT_SOURCE_DIR}/tests/unit/math/hyperbolic.cpp
${PROJECT_SOURCE_DIR}/tests/unit/math/log_exp.cpp
- ${PROJECT_SOURCE_DIR}/tests/unit/math/min_max.cpp
- ${PROJECT_SOURCE_DIR}/tests/unit/math/round.cpp
- ${PROJECT_SOURCE_DIR}/tests/unit/math/select.cpp
${PROJECT_SOURCE_DIR}/tests/unit/math/sin_cos.cpp
${PROJECT_SOURCE_DIR}/tests/unit/math/tan.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/abs.cpp
${PROJECT_SOURCE_DIR}/tests/unit/simd/complex.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/min_max.cpp
${PROJECT_SOURCE_DIR}/tests/unit/simd/operators.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/round.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/select.cpp
${PROJECT_SOURCE_DIR}/tests/unit/simd/shuffle.cpp
${PROJECT_SOURCE_DIR}/tests/unit/simd/vec.cpp
)
diff --git a/tests/unit/math/abs.cpp b/tests/unit/simd/abs.cpp
diff --git a/tests/unit/math/min_max.cpp b/tests/unit/simd/min_max.cpp
diff --git a/tests/unit/math/round.cpp b/tests/unit/simd/round.cpp
diff --git a/tests/unit/math/select.cpp b/tests/unit/simd/select.cpp