commit c9da5fa2939fce901f20629aececfc967673e17c
parent d66dc65dbf6eec4aa56c9ae03e31c915072eb5ed
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Sat, 8 Oct 2016 04:40:58 +0300
Add generic compiler-independent SIMD functions
Diffstat:
10 files changed, 228 insertions(+), 18 deletions(-)
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -35,7 +35,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSSE3
+#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
// floating point
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
@@ -66,7 +66,7 @@ KFR_SINTRIN u8avx abs(const u8avx& x) { return x; }
KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); }
KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); }
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -49,7 +49,7 @@ struct bitmask
type value;
};
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
#if defined CMT_ARCH_SSE41
@@ -161,7 +161,7 @@ KFR_SINTRIN bool bittestany(const vec<T, N>& a)
return bittestany(low(a)) || bittestany(high(a));
}
-#elif CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN bool bittestall(const u32neon& a)
{
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -36,7 +36,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); }
KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); }
@@ -106,7 +106,7 @@ KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y,
KFR_HANDLE_ALL_SIZES_2(min)
KFR_HANDLE_ALL_SIZES_2(max)
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); }
KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); }
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -54,7 +54,7 @@ namespace intrinsics
#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TRUNC)
#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_NINT)
-#if defined CMT_ARCH_SSE41
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); }
KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); }
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -71,7 +71,7 @@ KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>
return select(a < b, zerovector(a), a - b);
}
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); }
KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); }
@@ -108,7 +108,7 @@ KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs
KFR_HANDLE_ALL_SIZES_2(satadd)
KFR_HANDLE_ALL_SIZES_2(satsub)
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); }
KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); }
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -32,7 +32,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSE41
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
{
@@ -132,7 +132,7 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<
return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
}
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
{
diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp
@@ -35,6 +35,7 @@ constexpr size_t index_undefined = static_cast<size_t>(-1);
#ifdef CMT_COMPILER_CLANG
#define KFR_NATIVE_SIMD 1
+#define KFR_NATIVE_INTRINSICS 1
#endif
#ifdef KFR_NATIVE_SIMD
@@ -100,6 +101,215 @@ CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
#define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X))
#define KFR_SIMD_SHUFFLE(X, Y, ...) __builtin_shufflevector(X, Y, __VA_ARGS__)
-#endif
+#else
+
+namespace internal
+{
+
+template <typename T>
+struct simd_float_ops
+{
+ constexpr static T neg(T x) { return -x; }
+ constexpr static T bnot(T x) { return ~x; }
+
+ constexpr static T add(T x, T y) { return x + y; }
+ constexpr static T sub(T x, T y) { return x - y; }
+ constexpr static T mul(T x, T y) { return x * y; }
+ constexpr static T div(T x, T y) { return x / y; }
+ constexpr static T rem(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T band(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T bor(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T bxor(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T shl(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T shr(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+
+ constexpr static T eq(T x, T y) { return maskbits<T>(x == y); }
+ constexpr static T ne(T x, T y) { return maskbits<T>(x != y); }
+ constexpr static T lt(T x, T y) { return maskbits<T>(x < y); }
+ constexpr static T gt(T x, T y) { return maskbits<T>(x > y); }
+ constexpr static T le(T x, T y) { return maskbits<T>(x <= y); }
+ constexpr static T ge(T x, T y) { return maskbits<T>(x >= y); }
+};
+template <typename T>
+struct simd_int_ops : simd_float_ops<T>
+{
+ constexpr static T rem(T x, T y) { return x % y; }
+ constexpr static T band(T x, T y) { return x & y; }
+ constexpr static T bor(T x, T y) { return x | y; }
+ constexpr static T bxor(T x, T y) { return x ^ y; }
+ constexpr static T shl(T x, T y) { return x << y; }
+ constexpr static T shr(T x, T y) { return x >> y; }
+};
+}
+
+template <typename T, size_t N>
+struct alignas(next_poweroftwo(N * sizeof(T))) simd
+{
+ using ops =
+ conditional<std::is_floating_point<T>::value, internal::simd_float_ops<T>, internal::simd_int_ops<T>>;
+ constexpr static simd broadcast(T value) { return broadcast_impl(value, csizeseq<N>); }
+ constexpr friend simd operator+(const simd& x) { return x; }
+ constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, csizeseq<N>); }
+ constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, csizeseq<N>); }
+
+ constexpr friend simd operator+(const simd& x, const simd& y)
+ {
+ return op_impl<ops::add>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator-(const simd& x, const simd& y)
+ {
+ return op_impl<ops::sub>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator*(const simd& x, const simd& y)
+ {
+ return op_impl<ops::mul>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator/(const simd& x, const simd& y)
+ {
+ return op_impl<ops::div>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator&(const simd& x, const simd& y)
+ {
+ return op_impl<ops::band>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator|(const simd& x, const simd& y)
+ {
+ return op_impl<ops::bor>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator^(const simd& x, const simd& y)
+ {
+ return op_impl<ops::bxor>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator<<(const simd& x, const simd& y)
+ {
+ return op_impl<ops::shl>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator>>(const simd& x, const simd& y)
+ {
+ return op_impl<ops::shr>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator==(const simd& x, const simd& y)
+ {
+ return op_impl<ops::eq>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator!=(const simd& x, const simd& y)
+ {
+ return op_impl<ops::ne>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator<(const simd& x, const simd& y)
+ {
+ return op_impl<ops::lt>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator>(const simd& x, const simd& y)
+ {
+ return op_impl<ops::gt>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator<=(const simd& x, const simd& y)
+ {
+ return op_impl<ops::le>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator>=(const simd& x, const simd& y)
+ {
+ return op_impl<ops::ge>(x, y, csizeseq<N>);
+ }
+ constexpr T operator[](size_t index) const { return items[index]; }
+ T& operator[](size_t index) { return items[index]; }
+ T items[N];
+
+ template <typename U>
+ constexpr simd<U, N> cast() const
+ {
+ return cast_impl<U>(*this, csizeseq<N>);
+ }
+
+private:
+ template <typename U, size_t... indices>
+ constexpr static simd<U, N> cast_impl(const simd& x, csizes_t<indices...>)
+ {
+ return simd<U, N>{ static_cast<U>(x.items[indices])... };
+ }
+ template <T (*fn)(T), size_t... indices>
+ constexpr static simd op_impl(const simd& x, csizes_t<indices...>)
+ {
+ return simd{ fn(x.items[indices])... };
+ }
+ template <T (*fn)(T, T), size_t... indices>
+ constexpr static simd op_impl(const simd& x, const simd& y, csizes_t<indices...>)
+ {
+ return simd{ fn(x.items[indices], y.items[indices])... };
+ }
+ template <size_t... indices>
+ constexpr static simd broadcast_impl(T value, csizes_t<indices...>)
+ {
+ return simd{ ((void)indices, value)... };
+ }
+};
+
+template <typename To, typename From, size_t N>
+constexpr CMT_INLINE simd<To, N> simd_cast(const simd<From, N>& value) noexcept
+{
+ return value.template cast<To>();
+}
+
+template <typename T, size_t N, int... indices>
+constexpr CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const simd<T, N>& x, const simd<T, N>& y,
+ cints_t<indices...>) noexcept
+{
+ return simd<T, sizeof...(indices)>{ (indices == -1 ? T()
+ : ((indices >= N) ? y[indices - N] : x[indices]))... };
+}
+
+template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)>
+constexpr CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept
+{
+ union {
+ const simd<From, N> from;
+ const simd<To, Nout> to;
+ } u{ value };
+ return u.to;
+}
+
+template <size_t N, typename T>
+CMT_INLINE simd<T, N> simd_read_impl(const T* src, cfalse_t)
+{
+ simd<T, N> temp;
+ internal::builtin_memcpy(temp.items, src, N * sizeof(T));
+ return temp;
+}
+template <size_t N, typename T>
+CMT_INLINE simd<T, N> simd_read_impl(const T* src, ctrue_t)
+{
+ return *ptr_cast<simd<T, N>>(src);
+}
+
+template <size_t N, typename T>
+CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, cfalse_t)
+{
+ internal::builtin_memcpy(dest, value.items, N * sizeof(T));
+}
+template <size_t N, typename T>
+CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, ctrue_t)
+{
+ *ptr_cast<simd<T, N>>(dest) = value;
+}
+
+template <size_t N, bool A = false, typename T>
+CMT_INLINE simd<T, N> simd_read(const T* src)
+{
+ return simd_read_impl<N>(src, cbool<A>);
+}
+
+template <bool A = false, size_t N, typename T>
+CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
+{
+ return simd_write_impl<N>(dest, value, cbool<A>);
+}
+
+#define KFR_SIMD_CAST(T, N, X) (::kfr::simd_cast<T>(X))
+#define KFR_SIMD_BITCAST(T, N, X) (::kfr::simd_bitcast<T>(X))
+#define KFR_SIMD_BROADCAST(T, N, X) (::kfr::simd<T, N>::broadcast(X))
+#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints<__VA_ARGS__>)
+
+#endif
}
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -138,7 +138,7 @@ KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N
return formula;
}
-template <typename T, size_t N, typename = u8[N > 1]>
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
{
vec<itype<T>, N> quadrant;
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -33,7 +33,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
KFR_SINTRIN f64x1 sqrt(const f64x1& x)
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -436,21 +436,21 @@ template <typename From, size_t N, typename To = utype<From>,
size_t Nout = size_of<From>() * N / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<simd<To, Nout>>(*value);
+ return KFR_SIMD_BITCAST(To, Nout, *value);
}
template <typename From, size_t N, typename To = itype<From>,
size_t Nout = size_of<From>() * N / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<simd<To, Nout>>(*value);
+ return KFR_SIMD_BITCAST(To, Nout, *value);
}
template <typename From, size_t N, typename To = ftype<From>,
size_t Nout = size_of<From>() * N / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<simd<To, Nout>>(*value);
+ return KFR_SIMD_BITCAST(To, Nout, *value);
}
constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
@@ -1259,7 +1259,7 @@ CMT_INLINE vec<T, N> tovec(const mask<T, N>& x)
return *x;
}
-#if defined CMT_ARCH_SSE2 && defined CMT_COMPILER_GNU
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_SIMD
CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); }
CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); }
#endif