commit c9da5fa2939fce901f20629aececfc967673e17c
parent d66dc65dbf6eec4aa56c9ae03e31c915072eb5ed
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Sat, 8 Oct 2016 04:40:58 +0300
Add generic compiler-independent SIMD functions
Diffstat:
10 files changed, 228 insertions(+), 18 deletions(-)
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -35,7 +35,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSSE3
+#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
// floating point
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
@@ -66,7 +66,7 @@ KFR_SINTRIN u8avx abs(const u8avx& x) { return x; }
KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); }
KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); }
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -49,7 +49,7 @@ struct bitmask
type value;
};
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
#if defined CMT_ARCH_SSE41
@@ -161,7 +161,7 @@ KFR_SINTRIN bool bittestany(const vec<T, N>& a)
return bittestany(low(a)) || bittestany(high(a));
}
-#elif CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN bool bittestall(const u32neon& a)
{
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -36,7 +36,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); }
KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); }
@@ -106,7 +106,7 @@ KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y,
KFR_HANDLE_ALL_SIZES_2(min)
KFR_HANDLE_ALL_SIZES_2(max)
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); }
KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); }
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -54,7 +54,7 @@ namespace intrinsics
#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TRUNC)
#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_NINT)
-#if defined CMT_ARCH_SSE41
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); }
KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); }
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -71,7 +71,7 @@ KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>
return select(a < b, zerovector(a), a - b);
}
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); }
KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); }
@@ -108,7 +108,7 @@ KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs
KFR_HANDLE_ALL_SIZES_2(satadd)
KFR_HANDLE_ALL_SIZES_2(satsub)
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); }
KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); }
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -32,7 +32,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSE41
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
{
@@ -132,7 +132,7 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<
return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
}
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
{
diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp
@@ -35,6 +35,7 @@ constexpr size_t index_undefined = static_cast<size_t>(-1);
#ifdef CMT_COMPILER_CLANG
#define KFR_NATIVE_SIMD 1
+#define KFR_NATIVE_INTRINSICS 1
#endif
#ifdef KFR_NATIVE_SIMD
@@ -100,6 +101,215 @@ CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
#define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X))
#define KFR_SIMD_SHUFFLE(X, Y, ...) __builtin_shufflevector(X, Y, __VA_ARGS__)
-#endif
+#else
+
+namespace internal
+{
+
+template <typename T>
+struct simd_float_ops
+{
+ constexpr static T neg(T x) { return -x; }
+ constexpr static T bnot(T x) { return ~x; }
+
+ constexpr static T add(T x, T y) { return x + y; }
+ constexpr static T sub(T x, T y) { return x - y; }
+ constexpr static T mul(T x, T y) { return x * y; }
+ constexpr static T div(T x, T y) { return x / y; }
+ constexpr static T rem(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T band(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T bor(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T bxor(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T shl(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+ constexpr static T shr(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+
+ constexpr static T eq(T x, T y) { return maskbits<T>(x == y); }
+ constexpr static T ne(T x, T y) { return maskbits<T>(x != y); }
+ constexpr static T lt(T x, T y) { return maskbits<T>(x < y); }
+ constexpr static T gt(T x, T y) { return maskbits<T>(x > y); }
+ constexpr static T le(T x, T y) { return maskbits<T>(x <= y); }
+ constexpr static T ge(T x, T y) { return maskbits<T>(x >= y); }
+};
+template <typename T>
+struct simd_int_ops : simd_float_ops<T>
+{
+ constexpr static T rem(T x, T y) { return x % y; }
+ constexpr static T band(T x, T y) { return x & y; }
+ constexpr static T bor(T x, T y) { return x | y; }
+ constexpr static T bxor(T x, T y) { return x ^ y; }
+ constexpr static T shl(T x, T y) { return x << y; }
+ constexpr static T shr(T x, T y) { return x >> y; }
+};
+}
+
+template <typename T, size_t N>
+struct alignas(next_poweroftwo(N * sizeof(T))) simd
+{
+ using ops =
+ conditional<std::is_floating_point<T>::value, internal::simd_float_ops<T>, internal::simd_int_ops<T>>;
+ constexpr static simd broadcast(T value) { return broadcast_impl(value, csizeseq<N>); }
+ constexpr friend simd operator+(const simd& x) { return x; }
+ constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, csizeseq<N>); }
+ constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, csizeseq<N>); }
+
+ constexpr friend simd operator+(const simd& x, const simd& y)
+ {
+ return op_impl<ops::add>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator-(const simd& x, const simd& y)
+ {
+ return op_impl<ops::sub>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator*(const simd& x, const simd& y)
+ {
+ return op_impl<ops::mul>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator/(const simd& x, const simd& y)
+ {
+ return op_impl<ops::div>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator&(const simd& x, const simd& y)
+ {
+ return op_impl<ops::band>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator|(const simd& x, const simd& y)
+ {
+ return op_impl<ops::bor>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator^(const simd& x, const simd& y)
+ {
+ return op_impl<ops::bxor>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator<<(const simd& x, const simd& y)
+ {
+ return op_impl<ops::shl>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator>>(const simd& x, const simd& y)
+ {
+ return op_impl<ops::shr>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator==(const simd& x, const simd& y)
+ {
+ return op_impl<ops::eq>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator!=(const simd& x, const simd& y)
+ {
+ return op_impl<ops::ne>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator<(const simd& x, const simd& y)
+ {
+ return op_impl<ops::lt>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator>(const simd& x, const simd& y)
+ {
+ return op_impl<ops::gt>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator<=(const simd& x, const simd& y)
+ {
+ return op_impl<ops::le>(x, y, csizeseq<N>);
+ }
+ constexpr friend simd operator>=(const simd& x, const simd& y)
+ {
+ return op_impl<ops::ge>(x, y, csizeseq<N>);
+ }
+ constexpr T operator[](size_t index) const { return items[index]; }
+ T& operator[](size_t index) { return items[index]; }
+ T items[N];
+
+ template <typename U>
+ constexpr simd<U, N> cast() const
+ {
+ return cast_impl<U>(*this, csizeseq<N>);
+ }
+
+private:
+ template <typename U, size_t... indices>
+ constexpr static simd<U, N> cast_impl(const simd& x, csizes_t<indices...>)
+ {
+ return simd<U, N>{ static_cast<U>(x.items[indices])... };
+ }
+ template <T (*fn)(T), size_t... indices>
+ constexpr static simd op_impl(const simd& x, csizes_t<indices...>)
+ {
+ return simd{ fn(x.items[indices])... };
+ }
+ template <T (*fn)(T, T), size_t... indices>
+ constexpr static simd op_impl(const simd& x, const simd& y, csizes_t<indices...>)
+ {
+ return simd{ fn(x.items[indices], y.items[indices])... };
+ }
+ template <size_t... indices>
+ constexpr static simd broadcast_impl(T value, csizes_t<indices...>)
+ {
+ return simd{ ((void)indices, value)... };
+ }
+};
+
+template <typename To, typename From, size_t N>
+constexpr CMT_INLINE simd<To, N> simd_cast(const simd<From, N>& value) noexcept
+{
+ return value.template cast<To>();
+}
+
+template <typename T, size_t N, int... indices>
+constexpr CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const simd<T, N>& x, const simd<T, N>& y,
+ cints_t<indices...>) noexcept
+{
+ return simd<T, sizeof...(indices)>{ (indices == -1 ? T()
+ : ((indices >= N) ? y[indices - N] : x[indices]))... };
+}
+
+template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)>
+constexpr CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept
+{
+ union {
+ const simd<From, N> from;
+ const simd<To, Nout> to;
+ } u{ value };
+ return u.to;
+}
+
+template <size_t N, typename T>
+CMT_INLINE simd<T, N> simd_read_impl(const T* src, cfalse_t)
+{
+ simd<T, N> temp;
+ internal::builtin_memcpy(temp.items, src, N * sizeof(T));
+ return temp;
+}
+template <size_t N, typename T>
+CMT_INLINE simd<T, N> simd_read_impl(const T* src, ctrue_t)
+{
+ return *ptr_cast<simd<T, N>>(src);
+}
+
+template <size_t N, typename T>
+CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, cfalse_t)
+{
+ internal::builtin_memcpy(dest, value.items, N * sizeof(T));
+}
+template <size_t N, typename T>
+CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, ctrue_t)
+{
+ *ptr_cast<simd<T, N>>(dest) = value;
+}
+
+template <size_t N, bool A = false, typename T>
+CMT_INLINE simd<T, N> simd_read(const T* src)
+{
+ return simd_read_impl<N>(src, cbool<A>);
+}
+
+template <bool A = false, size_t N, typename T>
+CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
+{
+ return simd_write_impl<N>(dest, value, cbool<A>);
+}
+
+#define KFR_SIMD_CAST(T, N, X) (::kfr::simd_cast<T>(X))
+#define KFR_SIMD_BITCAST(T, N, X) (::kfr::simd_bitcast<T>(X))
+#define KFR_SIMD_BROADCAST(T, N, X) (::kfr::simd<T, N>::broadcast(X))
+#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints<__VA_ARGS__>)
+
+#endif
}
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -138,7 +138,7 @@ KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N
return formula;
}
-template <typename T, size_t N, typename = u8[N > 1]>
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
{
vec<itype<T>, N> quadrant;
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -33,7 +33,7 @@ namespace kfr
namespace intrinsics
{
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
KFR_SINTRIN f64x1 sqrt(const f64x1& x)
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -436,21 +436,21 @@ template <typename From, size_t N, typename To = utype<From>,
size_t Nout = size_of<From>() * N / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<simd<To, Nout>>(*value);
+ return KFR_SIMD_BITCAST(To, Nout, *value);
}
template <typename From, size_t N, typename To = itype<From>,
size_t Nout = size_of<From>() * N / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<simd<To, Nout>>(*value);
+ return KFR_SIMD_BITCAST(To, Nout, *value);
}
template <typename From, size_t N, typename To = ftype<From>,
size_t Nout = size_of<From>() * N / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<simd<To, Nout>>(*value);
+ return KFR_SIMD_BITCAST(To, Nout, *value);
}
constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
@@ -1259,7 +1259,7 @@ CMT_INLINE vec<T, N> tovec(const mask<T, N>& x)
return *x;
}
-#if defined CMT_ARCH_SSE2 && defined CMT_COMPILER_GNU
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_SIMD
CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); }
CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); }
#endif