kfr

Fast, modern C++ DSP framework: FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit c9da5fa2939fce901f20629aececfc967673e17c
parent d66dc65dbf6eec4aa56c9ae03e31c915072eb5ed
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Sat,  8 Oct 2016 04:40:58 +0300

Add generic compiler-independent SIMD functions

Diffstat:
M include/kfr/base/abs.hpp        |   4 ++--
M include/kfr/base/logical.hpp    |   4 ++--
M include/kfr/base/min_max.hpp    |   4 ++--
M include/kfr/base/round.hpp      |   2 +-
M include/kfr/base/saturation.hpp |   4 ++--
M include/kfr/base/select.hpp     |   4 ++--
M include/kfr/base/simd.hpp       | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M include/kfr/base/sin_cos.hpp    |   2 +-
M include/kfr/base/sqrt.hpp       |   2 +-
M include/kfr/base/vec.hpp        |   8 ++++----
10 files changed, 228 insertions(+), 18 deletions(-)
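
Note: for compilers other than Clang (which keeps the KFR_NATIVE_SIMD path), this commit adds a plain array-backed simd<T, N> whose operators are generated by expanding an index pack over scalar per-element functions, so the same vector code compiles everywhere and the optimizer is left to auto-vectorize the fixed-length expansion. Below is a minimal self-contained sketch of that pack-expansion pattern; it is not KFR code, and the names generic_simd, apply, and add_fn are invented for illustration.

    #include <cstddef>
    #include <utility>

    // Illustrative names only: generic_simd, apply, add_fn are not KFR identifiers.
    template <typename T, std::size_t N>
    struct generic_simd
    {
        T items[N];

        // Expand an index pack over a per-element function; the loop-free,
        // fixed-length expansion is easy for the optimizer to vectorize.
        template <T (*fn)(T, T), std::size_t... I>
        static constexpr generic_simd apply(const generic_simd& x, const generic_simd& y,
                                            std::index_sequence<I...>)
        {
            return generic_simd{ { fn(x.items[I], y.items[I])... } };
        }

        static constexpr T add_fn(T a, T b) { return a + b; }

        friend constexpr generic_simd operator+(const generic_simd& x, const generic_simd& y)
        {
            return apply<add_fn>(x, y, std::make_index_sequence<N>{});
        }
    };

    int main()
    {
        constexpr generic_simd<float, 4> a{ { 1, 2, 3, 4 } };
        constexpr generic_simd<float, 4> b{ { 10, 20, 30, 40 } };
        constexpr generic_simd<float, 4> c = a + b; // { 11, 22, 33, 44 }, computed at compile time
        return c.items[3] == 44 ? 0 : 1;
    }

The simd.hpp hunk below generalizes this through an ops policy class (simd_float_ops/simd_int_ops) so floating-point and integer lanes share a single set of operators.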

diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -35,7 +35,7 @@ namespace kfr
 namespace intrinsics
 {
 
-#if defined CMT_ARCH_SSSE3
+#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
 
 // floating point
 template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
@@ -66,7 +66,7 @@ KFR_SINTRIN u8avx abs(const u8avx& x) { return x; }
 
 KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
 
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); }
 KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); }
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -49,7 +49,7 @@ struct bitmask
     type value;
 };
 
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
 
 #if defined CMT_ARCH_SSE41
 
@@ -161,7 +161,7 @@ KFR_SINTRIN bool bittestany(const vec<T, N>& a)
     return bittestany(low(a)) || bittestany(high(a));
 }
 
-#elif CMT_ARCH_NEON
+#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN bool bittestall(const u32neon& a)
 {
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -36,7 +36,7 @@ namespace kfr
 namespace intrinsics
 {
 
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); }
 KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); }
@@ -106,7 +106,7 @@ KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y,
 KFR_HANDLE_ALL_SIZES_2(min)
 KFR_HANDLE_ALL_SIZES_2(max)
 
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); }
 KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); }
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -54,7 +54,7 @@ namespace intrinsics
 #define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TRUNC)
 #define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_NINT)
 
-#if defined CMT_ARCH_SSE41
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); }
 KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); }
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -71,7 +71,7 @@ KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>
     return select(a < b, zerovector(a), a - b);
 }
 
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); }
 KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); }
@@ -108,7 +108,7 @@ KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs
 KFR_HANDLE_ALL_SIZES_2(satadd)
 KFR_HANDLE_ALL_SIZES_2(satsub)
 
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); }
 KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); }
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -32,7 +32,7 @@ namespace kfr
 namespace intrinsics
 {
 
-#if defined CMT_ARCH_SSE41
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
 {
@@ -132,7 +132,7 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<
     return concat(select(low(a).asmask(), low(b), low(c)),
                   select(high(a).asmask(), high(b), high(c)));
 }
 
-#elif defined CMT_ARCH_NEON
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
 {
diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp
@@ -35,6 +35,7 @@ constexpr size_t index_undefined = static_cast<size_t>(-1);
 
 #ifdef CMT_COMPILER_CLANG
 #define KFR_NATIVE_SIMD 1
+#define KFR_NATIVE_INTRINSICS 1
 #endif
 
 #ifdef KFR_NATIVE_SIMD
@@ -100,6 +101,215 @@ CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
 #define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X))
 #define KFR_SIMD_SHUFFLE(X, Y, ...) __builtin_shufflevector(X, Y, __VA_ARGS__)
 
-#endif
+#else
+
+namespace internal
+{
+
+template <typename T>
+struct simd_float_ops
+{
+    constexpr static T neg(T x) { return -x; }
+    constexpr static T bnot(T x) { return ~x; }
+
+    constexpr static T add(T x, T y) { return x + y; }
+    constexpr static T sub(T x, T y) { return x - y; }
+    constexpr static T mul(T x, T y) { return x * y; }
+    constexpr static T div(T x, T y) { return x / y; }
+    constexpr static T rem(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+    constexpr static T band(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+    constexpr static T bor(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+    constexpr static T bxor(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+    constexpr static T shl(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+    constexpr static T shr(T x, T y) { return std::numeric_limits<T>::quiet_NaN(); }
+
+    constexpr static T eq(T x, T y) { return maskbits<T>(x == y); }
+    constexpr static T ne(T x, T y) { return maskbits<T>(x != y); }
+    constexpr static T lt(T x, T y) { return maskbits<T>(x < y); }
+    constexpr static T gt(T x, T y) { return maskbits<T>(x > y); }
+    constexpr static T le(T x, T y) { return maskbits<T>(x <= y); }
+    constexpr static T ge(T x, T y) { return maskbits<T>(x >= y); }
+};
+template <typename T>
+struct simd_int_ops : simd_float_ops<T>
+{
+    constexpr static T rem(T x, T y) { return x % y; }
+    constexpr static T band(T x, T y) { return x & y; }
+    constexpr static T bor(T x, T y) { return x | y; }
+    constexpr static T bxor(T x, T y) { return x ^ y; }
+    constexpr static T shl(T x, T y) { return x << y; }
+    constexpr static T shr(T x, T y) { return x >> y; }
+};
+}
+
+template <typename T, size_t N>
+struct alignas(next_poweroftwo(N * sizeof(T))) simd
+{
+    using ops =
+        conditional<std::is_floating_point<T>::value, internal::simd_float_ops<T>, internal::simd_int_ops<T>>;
+    constexpr static simd broadcast(T value) { return broadcast_impl(value, csizeseq<N>); }
+    constexpr friend simd operator+(const simd& x) { return x; }
+    constexpr friend simd operator-(const simd& x) { return op_impl<ops::neg>(x, csizeseq<N>); }
+    constexpr friend simd operator~(const simd& x) { return op_impl<ops::bnot>(x, csizeseq<N>); }
+
+    constexpr friend simd operator+(const simd& x, const simd& y)
+    {
+        return op_impl<ops::add>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator-(const simd& x, const simd& y)
+    {
+        return op_impl<ops::sub>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator*(const simd& x, const simd& y)
+    {
+        return op_impl<ops::mul>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator/(const simd& x, const simd& y)
+    {
+        return op_impl<ops::div>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator&(const simd& x, const simd& y)
+    {
+        return op_impl<ops::band>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator|(const simd& x, const simd& y)
+    {
+        return op_impl<ops::bor>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator^(const simd& x, const simd& y)
+    {
+        return op_impl<ops::bxor>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator<<(const simd& x, const simd& y)
+    {
+        return op_impl<ops::shl>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator>>(const simd& x, const simd& y)
+    {
+        return op_impl<ops::shr>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator==(const simd& x, const simd& y)
+    {
+        return op_impl<ops::eq>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator!=(const simd& x, const simd& y)
+    {
+        return op_impl<ops::ne>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator<(const simd& x, const simd& y)
+    {
+        return op_impl<ops::lt>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator>(const simd& x, const simd& y)
+    {
+        return op_impl<ops::gt>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator<=(const simd& x, const simd& y)
+    {
+        return op_impl<ops::le>(x, y, csizeseq<N>);
+    }
+    constexpr friend simd operator>=(const simd& x, const simd& y)
+    {
+        return op_impl<ops::ge>(x, y, csizeseq<N>);
+    }
+    constexpr T operator[](size_t index) const { return items[index]; }
+    T& operator[](size_t index) { return items[index]; }
+    T items[N];
+
+    template <typename U>
+    constexpr simd<U, N> cast() const
+    {
+        return cast_impl<U>(*this, csizeseq<N>);
+    }
+
+private:
+    template <typename U, size_t... indices>
+    constexpr static simd<U, N> cast_impl(const simd& x, csizes_t<indices...>)
+    {
+        return simd<U, N>{ static_cast<U>(x.items[indices])... };
+    }
+    template <T (*fn)(T), size_t... indices>
+    constexpr static simd op_impl(const simd& x, csizes_t<indices...>)
+    {
+        return simd{ fn(x.items[indices])... };
+    }
+    template <T (*fn)(T, T), size_t... indices>
+    constexpr static simd op_impl(const simd& x, const simd& y, csizes_t<indices...>)
+    {
+        return simd{ fn(x.items[indices], y.items[indices])... };
+    }
+    template <size_t... indices>
+    constexpr static simd broadcast_impl(T value, csizes_t<indices...>)
+    {
+        return simd{ ((void)indices, value)... };
+    }
+};
+
+template <typename To, typename From, size_t N>
+constexpr CMT_INLINE simd<To, N> simd_cast(const simd<From, N>& value) noexcept
+{
+    return value.template cast<To>();
+}
+
+template <typename T, size_t N, int... indices>
+constexpr CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const simd<T, N>& x, const simd<T, N>& y,
+                                                              cints_t<indices...>) noexcept
+{
+    return simd<T, sizeof...(indices)>{ (indices == -1 ? T()
+                                                       : ((indices >= N) ? y[indices - N] : x[indices]))...
+    };
+}
+
+template <typename To, typename From, size_t N, size_t Nout = N * sizeof(From) / sizeof(To)>
+constexpr CMT_INLINE simd<To, Nout> simd_bitcast(const simd<From, N>& value) noexcept
+{
+    union {
+        const simd<From, N> from;
+        const simd<To, Nout> to;
+    } u{ value };
+    return u.to;
+}
+
+template <size_t N, typename T>
+CMT_INLINE simd<T, N> simd_read_impl(const T* src, cfalse_t)
+{
+    simd<T, N> temp;
+    internal::builtin_memcpy(temp.items, src, N * sizeof(T));
+    return temp;
+}
+template <size_t N, typename T>
+CMT_INLINE simd<T, N> simd_read_impl(const T* src, ctrue_t)
+{
+    return *ptr_cast<simd<T, N>>(src);
+}
+
+template <size_t N, typename T>
+CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, cfalse_t)
+{
+    internal::builtin_memcpy(dest, value.items, N * sizeof(T));
+}
+template <size_t N, typename T>
+CMT_INLINE void simd_write_impl(T* dest, const simd<T, N>& value, ctrue_t)
+{
+    *ptr_cast<simd<T, N>>(dest) = value;
+}
+
+template <size_t N, bool A = false, typename T>
+CMT_INLINE simd<T, N> simd_read(const T* src)
+{
+    return simd_read_impl<N>(src, cbool<A>);
+}
+
+template <bool A = false, size_t N, typename T>
+CMT_INLINE void simd_write(T* dest, const simd<T, N>& value)
+{
+    return simd_write_impl<N>(dest, value, cbool<A>);
+}
+
+#define KFR_SIMD_CAST(T, N, X) (::kfr::simd_cast<T>(X))
+#define KFR_SIMD_BITCAST(T, N, X) (::kfr::simd_bitcast<T>(X))
+#define KFR_SIMD_BROADCAST(T, N, X) (::kfr::simd<T, N>::broadcast(X))
+#define KFR_SIMD_SHUFFLE(X, Y, ...) simd_shuffle(X, Y, cints<__VA_ARGS__>)
+
+#endif
 }
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -138,7 +138,7 @@ KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N
     return formula;
 }
 
-template <typename T, size_t N, typename = u8[N > 1]>
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
 KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
 {
     vec<itype<T>, N> quadrant;
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -33,7 +33,7 @@ namespace kfr
 namespace intrinsics
 {
 
-#if defined CMT_ARCH_SSE2
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
 
 KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
 KFR_SINTRIN f64x1 sqrt(const f64x1& x)
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -436,21 +436,21 @@ template <typename From, size_t N, typename To = utype<From>,
           size_t Nout = size_of<From>() * N / size_of<To>()>
 constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept
 {
-    return reinterpret_cast<simd<To, Nout>>(*value);
+    return KFR_SIMD_BITCAST(To, Nout, *value);
 }
 
 template <typename From, size_t N, typename To = itype<From>,
           size_t Nout = size_of<From>() * N / size_of<To>()>
 constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept
 {
-    return reinterpret_cast<simd<To, Nout>>(*value);
+    return KFR_SIMD_BITCAST(To, Nout, *value);
 }
 
 template <typename From, size_t N, typename To = ftype<From>,
           size_t Nout = size_of<From>() * N / size_of<To>()>
 constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept
 {
-    return reinterpret_cast<simd<To, Nout>>(*value);
+    return KFR_SIMD_BITCAST(To, Nout, *value);
 }
 
 constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
@@ -1259,7 +1259,7 @@ CMT_INLINE vec<T, N> tovec(const mask<T, N>& x)
     return *x;
 }
 
-#if defined CMT_ARCH_SSE2 && defined CMT_COMPILER_GNU
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_SIMD
 CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); }
 CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); }
 #endif
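
Note on the simd_read/simd_write pair added in simd.hpp: the bool alignment flag is turned into a tag value (cbool<A>), so overload resolution picks the aligned or unaligned path at compile time with no runtime branch; the unaligned path goes through memcpy, which optimizers typically fold into a single unaligned vector load. Here is a self-contained sketch of that tag-dispatch idiom; the names pack, read, and read_impl are invented for illustration, not KFR identifiers.

    #include <cstddef>
    #include <cstring>
    #include <type_traits>

    // Illustrative names only: pack, read, read_impl are not KFR identifiers.
    template <typename T, std::size_t N>
    struct alignas(sizeof(T) * N) pack // assumes sizeof(T) * N is a power of two
    {
        T items[N];
    };

    // Unaligned path: memcpy is valid for any pointer and is typically
    // folded into a single unaligned vector load by the optimizer.
    template <std::size_t N, typename T>
    pack<T, N> read_impl(const T* src, std::false_type)
    {
        pack<T, N> tmp;
        std::memcpy(tmp.items, src, N * sizeof(T));
        return tmp;
    }

    // Aligned path: a direct load through the over-aligned type; like the
    // diff's ptr_cast, this assumes the pointer really is aligned.
    template <std::size_t N, typename T>
    pack<T, N> read_impl(const T* src, std::true_type)
    {
        return *reinterpret_cast<const pack<T, N>*>(src);
    }

    // The entry point converts the bool template parameter into a tag type,
    // selecting an overload at compile time with no runtime branch.
    template <std::size_t N, bool Aligned = false, typename T>
    pack<T, N> read(const T* src)
    {
        return read_impl<N>(src, std::integral_constant<bool, Aligned>{});
    }

    int main()
    {
        alignas(16) float data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        pack<float, 4> a = read<4, true>(data); // 16-byte aligned: direct load
        pack<float, 4> u = read<4>(data + 1);   // misaligned by 4 bytes: memcpy path
        return (a.items[0] == 1 && u.items[0] == 2) ? 0 : 1;
    }

The union-based simd_bitcast in the same hunk plays a similar role for lane reinterpretation: the old reinterpret_cast<simd<To, Nout>>(value) form in vec.hpp only compiles for Clang's vector types, so the bitcasts now route through the KFR_SIMD_BITCAST macro, which works for both the builtin and the generic simd type.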