kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 724eb3ba4065f91320539110970f069cf966f0e2
parent 966319a1104414ccce9ffc65632ffb38c87f479a
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 22 Nov 2022 19:28:16 +0000

Revert "Generic backend refactoring" because of ICE in MSVC

This reverts commit 966319a1104414ccce9ffc65632ffb38c87f479a.

Diffstat:
M CMakeLists.txt | 4 +---
M include/kfr/base/tensor.hpp | 15 ---------------
M include/kfr/cident.h | 2 +-
M include/kfr/cometa.hpp | 7 -------
M include/kfr/dft/impl/bitrev.hpp | 4 ++--
M include/kfr/dft/impl/convolution-impl.cpp | 4 ++--
M include/kfr/dft/impl/ft.hpp | 18 ++++++++----------
M include/kfr/simd/complex.hpp | 7 ++++---
M include/kfr/simd/impl/backend_clang.hpp | 64 +++++++++++++++++++++++++++++++++++-----------------------------
M include/kfr/simd/impl/backend_generic.hpp | 1389 ++++++++++++++++++++++++++++++++++++-------------------------------------------
M include/kfr/simd/impl/basicoperators_clang.hpp | 2 +-
M include/kfr/simd/impl/select.hpp | 9 +++++----
M include/kfr/simd/impl/simd.hpp | 25 +++++++++----------------
M include/kfr/simd/shuffle.hpp | 24 ++++++++++++++++++++++--
M include/kfr/simd/vec.hpp | 28 ++++++++++++++--------------
M include/kfr/testo/comparison.hpp | 3 ---
M tests/base_test.cpp | 3 ---
M tests/dft_test.cpp | 158 +++++++++++++++++++++++++++++++++++++++----------------------------------------
M tests/unit/dsp/window.cpp | 6 ------
19 files changed, 823 insertions(+), 949 deletions(-)
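
Note: the restored include/kfr/simd/impl/backend_generic.hpp dispatches simd_shuffle through ranked tag arguments (overload_priority<9>, overload_priority<2>, overload_generic, overload_auto) instead of the simd_tag/simd_shuf interface introduced by the reverted refactoring. The sketch below is only an illustration of that overload-ranking idiom; shuffle_impl, the array parameters, and the rank values are hypothetical names invented for this example, not KFR's actual API.

    #include <cstddef>
    #include <cstdio>
    #include <type_traits>

    // overload_priority<N> inherits from overload_priority<N - 1>, so converting the
    // caller's tag to a more derived base outranks converting it to a less derived one.
    template <int N>
    struct overload_priority : overload_priority<N - 1> {};
    template <>
    struct overload_priority<0> {};

    using overload_generic = overload_priority<0>;
    constexpr overload_priority<10> overload_auto{}; // callers always pass the highest rank

    // "Fast path": participates only for 4-element inputs (SFINAE constraint).
    template <std::size_t N, std::enable_if_t<N == 4, int> = 0>
    int shuffle_impl(const int (&x)[N], overload_priority<2>)
    {
        std::puts("specialized 4-wide path");
        return x[0];
    }

    // Generic fallback for any width, lowest rank.
    template <std::size_t N>
    int shuffle_impl(const int (&x)[N], overload_generic)
    {
        std::puts("generic fallback");
        return x[0];
    }

    int main()
    {
        int a[4] = { 1, 2, 3, 4 };
        int b[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        shuffle_impl(a, overload_auto); // both overloads viable; the higher-ranked tag wins
        shuffle_impl(b, overload_auto); // only the generic overload is viable
        return 0;
    }

Higher-ranked overloads cover shuffles a target has a native intrinsic for; anything left over falls through to the overload_generic implementation, which is the pattern visible in the diff below.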

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -210,9 +210,7 @@ endfunction () if (KFR_ENABLE_DFT) - if (CLANG) - set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast") - endif () + set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast") if (KFR_ENABLE_DFT_MULTIARCH) add_library(kfr_dft INTERFACE) diff --git a/include/kfr/base/tensor.hpp b/include/kfr/base/tensor.hpp @@ -312,25 +312,10 @@ public: }; } -#if defined(CMT_COMPILER_IS_MSVC) - tensor(const tensor& other) - : m_data(other.m_data), m_size(other.m_size), m_is_contiguous(other.m_is_contiguous), - m_shape(other.m_shape), m_strides(other.m_strides), m_finalizer(other.m_finalizer) - { - } - tensor(tensor&& other) - : m_data(other.m_data), m_size(other.m_size), m_is_contiguous(other.m_is_contiguous), - m_shape(other.m_shape), m_strides(other.m_strides), m_finalizer(std::move(other.m_finalizer)) - { - } - tensor(tensor& other) : tensor(const_cast<const tensor&>(other)) {} - tensor(const tensor&& other) : tensor(static_cast<const tensor&>(other)) {} -#else tensor(const tensor&) = default; tensor(tensor&&) = default; tensor(tensor& other) : tensor(const_cast<const tensor&>(other)) {} tensor(const tensor&& other) : tensor(static_cast<const tensor&>(other)) {} -#endif #if defined(CMT_COMPILER_IS_MSVC) tensor& operator=(const tensor& src) & diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -411,7 +411,7 @@ extern char* gets(char* __s); #define CMT_NODEBUG #define CMT_INLINE inline CMT_INLINE_IN_RELEASE #define CMT_INLINE_MEMBER CMT_INLINE_IN_RELEASE -#if _MSC_VER >= 1927 && _MSVC_LANG >= 202002L +#if _MSC_VER >= 1927 #define CMT_INLINE_LAMBDA [[msvc::forceinline]] #else #define CMT_INLINE_LAMBDA diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -454,13 +454,6 @@ struct cvals_t<T> static CMT_MEM_INTRINSIC const T* array() { return nullptr; } }; -template <typename T, bool... flags, T... values1, T... values2> -constexpr cvals_t<T, (flags ? values1 : values2)...> select(cvals_t<bool, flags...>, cvals_t<T, values1...>, - cvals_t<T, values2...>) -{ - return {}; -} - namespace details { template <size_t index, typename T, T... 
vals> diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp @@ -273,12 +273,12 @@ KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, template <typename T> KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) { - const size_t N = size_t(1) << log2n; + const size_t N = 1 << log2n; const size_t N4 = N / 4; const size_t iend = N / 16 * 4 * 2; constexpr size_t istep = 2 * 4; const size_t jstep1 = (1 << (log2n - 5)) * 4 * 2; - const size_t jstep2 = size_t(size_t(1) << (log2n - 5)) * 4 * 2 - size_t(size_t(1) << (log2n - 6)) * 4 * 2; + const size_t jstep2 = size_t(1 << (log2n - 5)) * 4 * 2 - size_t(1 << (log2n - 6)) * 4 * 2; T* io = ptr_cast<T>(inout); for (size_t i = 0; i < iend;) diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp @@ -51,7 +51,7 @@ univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<co dft->execute(src2padded, src2padded, temp); src1padded = src1padded * src2padded; dft->execute(src1padded, src1padded, temp, true); - const ST invsize = reciprocal<ST>(static_cast<ST>(size)); + const ST invsize = reciprocal<ST>(size); return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; } @@ -70,7 +70,7 @@ univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<c dft->execute(src2padded, src2padded, temp); src1padded = src1padded * src2padded; dft->execute(src1padded, src1padded, temp, true); - const ST invsize = reciprocal<ST>(static_cast<ST>(size)); + const ST invsize = reciprocal<ST>(size); return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; } diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp @@ -189,9 +189,8 @@ KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) v.write(ptr_cast<T>(dest), cbool_t<A>()); } -#ifdef CMT_COPMILER_CLANG template <> -KFR_INTRINSIC cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* src) +inline cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* src) { const cvec<f32, 4> l = concat(cread<2>(src), cread<2>(src + 4)); const cvec<f32, 4> h = concat(cread<2>(src + 2), cread<2>(src + 6)); @@ -199,7 +198,7 @@ KFR_INTRINSIC cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); } template <> -KFR_INTRINSIC cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* src) +inline cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* src) { const cvec<f32, 4> l = concat(cread<2, true>(src), cread<2, true>(src + 4)); const cvec<f32, 4> h = concat(cread<2, true>(src + 2), cread<2, true>(src + 6)); @@ -208,7 +207,7 @@ KFR_INTRINSIC cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* s } template <> -KFR_INTRINSIC cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src) +inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src) { const cvec<f64, 2> l = concat(cread<1>(src), cread<1>(src + 2)); const cvec<f64, 2> h = concat(cread<1>(src + 1), cread<1>(src + 3)); @@ -217,7 +216,7 @@ KFR_INTRINSIC cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* } template <> -KFR_INTRINSIC void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) +inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) { const cvec<f32, 8> xx = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), 
high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); @@ -230,7 +229,7 @@ KFR_INTRINSIC void cwrite_split<8, false, true, f32>(complex<f32>* dest, const c cwrite<2>(dest + 6, d); } template <> -KFR_INTRINSIC void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) +inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) { const cvec<f32, 8> xx = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); @@ -244,7 +243,7 @@ KFR_INTRINSIC void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cv } template <> -KFR_INTRINSIC void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) +inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) { const cvec<f64, 4> xx = concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); @@ -254,7 +253,7 @@ KFR_INTRINSIC void cwrite_split<4, false, true, f64>(complex<f64>* dest, const c cwrite<1>(dest + 3, part<4, 3>(xx)); } template <> -KFR_INTRINSIC void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) +inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) { const cvec<f64, 4> xx = concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); @@ -263,7 +262,6 @@ KFR_INTRINSIC void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cv cwrite<1, true>(dest + 1, part<4, 2>(xx)); cwrite<1, true>(dest + 3, part<4, 3>(xx)); } -#endif template <size_t N, size_t stride, typename T, size_t... Indices> KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) @@ -1775,7 +1773,7 @@ KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src) return digitreverse4<2>(cread<N, A>(src)); } -#ifdef CMT_COMPILER_CLANG +#if 1 template <> KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) diff --git a/include/kfr/simd/complex.hpp b/include/kfr/simd/complex.hpp @@ -220,15 +220,16 @@ template <typename T, size_t N, size_t... indices> KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevector(const vec<complex<T>, N>& x, csizes_t<indices...>) CMT_NOEXCEPT { - return intrinsics::simd_shuffle(intrinsics::simd_tag_v<unwrap_bit<T>, N>, scale<2, indices...>(), x.v); + return intrinsics::simd_shuffle(intrinsics::simd_t<unwrap_bit<T>, N>{}, x.v, scale<2, indices...>(), + overload_auto); } template <typename T, size_t N, size_t... indices> KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevectors(const vec<complex<T>, N>& x, const vec<T, N>& y, csizes_t<indices...>) CMT_NOEXCEPT { - return intrinsics::simd_shuffle(intrinsics::simd_tag_v<unwrap_bit<T>, N, N>, x.v, y.v, - scale<2, indices...>()); + return intrinsics::simd_shuffle(intrinsics::simd2_t<unwrap_bit<T>, N, N>{}, x.v, y.v, + scale<2, indices...>(), overload_auto); } namespace internal { diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp @@ -39,11 +39,11 @@ template <typename TT, size_t NN> using simd = unwrap_bit<TT> __attribute__((ext_vector_type(NN))); template <typename T, size_t N1> -KFR_INTRINSIC simd<T, N1> simd_concat(simd_tag<T, N1>, const simd<T, N1>& x); +KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x); template <typename T, size_t N1, size_t N2, size_t... 
Ns, size_t Nscount = csum(csizes<Ns...>)> -KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(simd_tag<T, N1, N2, Ns...>, const simd<T, N1>& x, - const simd<T, N2>& y, const simd<T, Ns>&... z); +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... z); template <typename Tout> KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete; @@ -84,13 +84,13 @@ KFR_INTRINSIC simd<Tout, N> simd_allones() /// @brief Converts input vector to vector with subtype Tout template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout))> -KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) { return (simd<Tout, Nout>)x; } template <typename T, size_t N> -KFR_INTRINSIC simd<T, N> simd_bitcast(simd_cvt_tag<T, T, N>, const simd<T, N>& x) +KFR_INTRINSIC simd<T, N> simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) { return x; } @@ -109,59 +109,65 @@ KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) } template <typename T, size_t N> -KFR_INTRINSIC simd<T, N> simd_broadcast(simd_tag<T, N>, identity<T> value) +KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) { return static_cast<unwrap_bit<T>>(value); } template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> -KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_tag<T, N>, csizes_t<indices...>, const simd<T, N>& x) +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>, + overload_generic) { return __builtin_shufflevector(x, x, (indices > N ? -1 : static_cast<int>(indices))...); } -template <typename T, size_t N1, size_t N2, size_t... indices, size_t Nout = sizeof...(indices)> -KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_tag<T, N1, N2>, csizes_t<indices...>, const simd<T, N1>& x, - const simd<T, N2>& y) +template <typename T, size_t N, size_t N2 = N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizes_t<indices...>, overload_generic) { - if constexpr (N1 == N2) - return __builtin_shufflevector(x, y, (indices > 2 * N1 ? -1 : static_cast<int>(indices))...); - else - { - constexpr size_t Nmax = (N1 > N2 ? N1 : N2); - return simd_shuffle(simd_tag_v<T, Nmax, Nmax>, - csizes<(indices < N1 ? indices - : indices < N1 + N2 ? indices + (Nmax - N1) - : index_undefined)...>, - simd_shuffle(simd_tag_v<T, N1>, csizeseq<Nmax>, x), - simd_shuffle(simd_tag_v<T, N2>, csizeseq<Nmax>, y)); - } + static_assert(N == N2, ""); + return __builtin_shufflevector(x, y, (indices > 2 * N ? -1 : static_cast<int>(indices))...); +} + +template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y, + csizes_t<indices...>, overload_generic) +{ + constexpr size_t Nmax = (N1 > N2 ? N1 : N2); + return simd_shuffle(simd2_t<T, Nmax, Nmax>{}, + simd_shuffle(simd_t<T, N1>{}, x, csizeseq<Nmax>, overload_auto), + simd_shuffle(simd_t<T, N2>{}, y, csizeseq<Nmax>, overload_auto), + csizes<(indices < N1 ? indices + : indices < N1 + N2 ? 
indices + (Nmax - N1) + : index_undefined)...>, + overload_auto); } template <typename T, size_t N1> -KFR_INTRINSIC simd<T, N1> simd_concat(simd_tag<T, N1>, const simd<T, N1>& x) +KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x) { return x; } template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/> -KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(simd_tag<T, N1, N2, Ns...>, const simd<T, N1>& x, - const simd<T, N2>& y, const simd<T, Ns>&... z) +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, Ns>&... z) { - return simd_shuffle(simd_tag_v<T, N1, N2 + Nscount>, csizeseq<N1 + N2 + Nscount>, x, - simd_concat(simd_tag_v<T, N2, Ns...>, y, z...)); + return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...), + csizeseq<N1 + N2 + Nscount>, overload_auto); } /// @brief Converts input vector to vector with subtype Tout template <typename Tout, typename Tin, size_t N> -KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) +KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) { return __builtin_convertvector(x, simd<Tout, N>); } /// @brief Converts input vector to vector with subtype Tout template <typename T, size_t N> -KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_tag<T, T, N>, const simd<T, N>& x) +KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) { return x; } diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -32,11 +32,6 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized") namespace kfr { - -constexpr inline size_t operand_undefined = SIZE_MAX; - -constexpr size_t flush_op(size_t op) { return op == operand_undefined ? 0 : op; } - template <size_t bits, size_t...> struct shuffle_mask; @@ -79,8 +74,6 @@ CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT; #define not_optimized(...) 
CMT_NOOP #endif -#define NOT_OPTIMIZED not_optimized("OPTOPP:" CMT_FUNC_SIGNATURE) - inline namespace CMT_ARCH_NAME { @@ -281,41 +274,20 @@ KFR_SIMD_TYPE(i32, 16, __m512i) KFR_SIMD_TYPE(i64, 8, __m512i) #endif // CMT_ARCH_AVX512 -#if defined CMT_COMPILER_IS_MSVC && defined CMT_ARCH_X32 -KFR_INTRINSIC __m128i _mm_cvtsi64_si128(int64_t u) -{ - __m128i r = _mm_setzero_si128(); - r.m128i_i64[0] = u; - return r; -} -KFR_INTRINSIC int64_t _mm_cvtsi128_si64(const __m128i& i) { return i.m128i_i64[0]; } -KFR_INTRINSIC int64_t _mm_cvttsd_si64(const __m128d& d) { return static_cast<int64_t>(d.m128d_f64[0]); } -KFR_INTRINSIC __m128d _mm_cvtsi64_sd(const __m128d& a, int64_t b) -{ - __m128d r = a; - r.m128d_f64[0] = static_cast<double>(b); - return r; -} -#endif - -KFR_INTRINSIC double take_hi_sd(__m128d x) { return _mm_cvtsd_f64(_mm_unpackhi_pd(x, x)); } - -KFR_INTRINSIC __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT -{ - return _mm_set_epi64x(q1, q0); -} -KFR_INTRINSIC __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT -{ - return _mm_set_epi32(q3, q2, q1, q0); -} - -#ifdef CMT_ARCH_AVX2 - -KFR_INTRINSIC __m256i KFR_mm_broadcastsi128_si256(const __m128i& x) -{ - return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); -} -#endif +#ifdef CMT_ARCH_NEON +KFR_SIMD_TYPE(u8, 16, uint8x16_t); +KFR_SIMD_TYPE(u16, 8, uint16x8_t); +KFR_SIMD_TYPE(u32, 4, uint32x4_t); +KFR_SIMD_TYPE(u64, 2, uint64x2_t); +KFR_SIMD_TYPE(i8, 16, int8x16_t); +KFR_SIMD_TYPE(i16, 8, int16x8_t); +KFR_SIMD_TYPE(i32, 4, int32x4_t); +KFR_SIMD_TYPE(i64, 2, int64x2_t); +KFR_SIMD_TYPE(f32, 4, float32x4_t); +#ifdef CMT_ARCH_NEON64 +KFR_SIMD_TYPE(f64, 2, float64x2_t); +#endif // CMT_ARCH_NEON64 +#endif // CMT_ARCH_NEON #if defined CMT_COMPILER_IS_MSVC #define KFR_i8sse_INDEX(x, i) x.m128i_i8[i] @@ -343,6 +315,9 @@ KFR_INTRINSIC __m256i KFR_mm_broadcastsi128_si256(const __m128i& x) // specializations +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>); + #ifdef KFR_NATIVE_INTRINSICS #define KFR_GEN_ty(n, ty) ty(n) @@ -357,6 +332,16 @@ KFR_INTRINSIC __m256i KFR_mm_broadcastsi128_si256(const __m128i& x) #ifdef CMT_ARCH_SSE2 +KFR_INTRINSIC double take_hi_sd(__m128d x) { return _mm_cvtsd_f64(_mm_unpackhi_pd(x, x)); } + +KFR_INTRINSIC __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT +{ + return _mm_set_epi64x(q1, q0); +} +KFR_INTRINSIC __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT +{ + return _mm_set_epi32(q3, q2, q1, q0); +} KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x) KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x) KFR_INTRIN_MAKE(2, f64, _mm_setr_pd) @@ -369,7 +354,7 @@ KFR_INTRIN_MAKE(16, i8, _mm_setr_epi8) KFR_INTRIN_MAKE(16, u8, _mm_setr_epi8) #define KFR_INTRIN_BITCAST(Tout, Tin, N, ...) \ - KFR_INTRINSIC simd<Tout, N> simd_bitcast(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + KFR_INTRINSIC simd<Tout, N> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ { \ return __VA_ARGS__; \ } @@ -379,7 +364,7 @@ KFR_INTRIN_BITCAST(f64, i64, 2, _mm_castsi128_pd(x)) KFR_INTRIN_BITCAST(i64, f64, 2, _mm_castpd_si128(x)) #define KFR_INTRIN_BROADCAST(T, N, ...) 
\ - KFR_INTRINSIC simd<T, N> simd_broadcast(simd_tag<T, N>, T value) CMT_NOEXCEPT { return __VA_ARGS__; } + KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, T value) CMT_NOEXCEPT { return __VA_ARGS__; } KFR_INTRIN_BROADCAST(i8, 16, _mm_set1_epi8(value)) KFR_INTRIN_BROADCAST(i16, 8, _mm_set1_epi16(value)) @@ -396,35 +381,69 @@ KFR_INTRIN_BROADCAST(i32, 2, simd<i32, 2>(value, value)) KFR_INTRIN_BROADCAST(u32, 2, simd<u32, 2>(value, value)) KFR_INTRIN_BROADCAST(f32, 2, simd<f32, 2>{ value, value }) +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<float, Nout> simd_shuffle(simd_t<float, N>, const simd<float, N>& x, + csizes_t<indices...> ind, overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<float, N>{}, x, ind); +} + +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<double, Nout> simd_shuffle(simd_t<double, N>, const simd<double, N>& x, + csizes_t<indices...> ind, overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<double, N>{}, x, ind); +} + +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<float, Nout> simd_shuffle(simd2_t<float, N, N>, const simd<float, N>& x, + const simd<float, N>& y, csizes_t<indices...> ind, + overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<float, 2 * N>{}, simd_from_halves(simd_t<float, 2 * N>{}, x, y), ind); +} + +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<double, Nout> simd_shuffle(simd2_t<double, N, N>, const simd<double, N>& x, + const simd<double, N>& y, csizes_t<indices...> ind, + overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<double, 2 * N>{}, simd_from_halves(simd_t<double, 2 * N>{}, x, y), ind); +} + +#define KFR_INTRIN_SHUFFLE_DUPHALVES(T, N, ...) \ + KFR_INTRINSIC simd<T, N * 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, \ + decltype(csizeseq<N * 2> % csize<N>), overload_priority<9>) \ + CMT_NOEXCEPT \ + { \ + return __VA_ARGS__; \ + } + #define KFR_INTRIN_SHUFFLE_SWAP(T, N, ...) \ - KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T, N>, decltype(csizeseq<N> ^ csize<1>), \ - csizeseq_t<N, 0, 0>, const simd<T, N>& x) CMT_NOEXCEPT \ + KFR_INTRINSIC simd<T, N> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, \ + decltype(csizeseq<N> ^ csize<1>), overload_priority<9>) \ + CMT_NOEXCEPT \ { \ return __VA_ARGS__; \ } -#define KFR_INTRIN_SHUFFLE_EXTEND(T, Nout, Nin, ...) \ - KFR_INTRINSIC simd<T, Nout> simd_shuf( \ - simd_tag<T, Nin>, concat_lists<csizeseq_t<Nin>, csizeseq_t<Nout - Nin, 0, 0>>, \ - concat_lists<csizeseq_t<Nin, 0, 0>, csizeseq_t<Nout - Nin, operand_undefined, 0>>, \ - const simd<T, Nin>& x) CMT_NOEXCEPT \ +#define KFR_INTRIN_SHUFFLE_LINEAR(T, Nout, Nin, ...) \ + KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizeseq_t<Nout>, \ + overload_priority<9>) CMT_NOEXCEPT \ { \ - static_assert(Nout > Nin); \ return __VA_ARGS__; \ } -#define KFR_INTRIN_SHUFFLE_SLICE(T, Nout, Nin, Nstart, ...) \ - KFR_INTRINSIC simd<T, Nout> simd_shuf(simd_tag<T, Nin>, csizeseq_t<Nout, Nstart>, \ - csizeseq_t<Nout, 0, 0>, const simd<T, Nin>& x) CMT_NOEXCEPT \ +#define KFR_INTRIN_SHUFFLE_LINEAR_START(T, Nout, Nin, Nstart, ...) 
\ + KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, \ + csizeseq_t<Nout, Nstart>, overload_priority<9>) CMT_NOEXCEPT \ { \ - static_assert(Nout < Nin); \ return __VA_ARGS__; \ } #define KFR_INTRIN_SHUFFLE_CONCAT(T, Nin, ...) \ - KFR_INTRINSIC simd<T, Nin + Nin> simd_shuf(simd_tag<T, Nin, Nin>, \ - concat_lists<csizeseq_t<Nin>, csizeseq_t<Nin>>, \ - concat_lists<csizeseq_t<Nin, 0, 0>, csizeseq_t<Nin, 1, 0>>, \ - const simd<T, Nin>& x, const simd<T, Nin>& y) CMT_NOEXCEPT \ + KFR_INTRINSIC simd<T, Nin + Nin> simd_shuffle(simd2_t<T, Nin, Nin>, const simd<T, Nin>& x, \ + const simd<T, Nin>& y, csizeseq_t<Nin + Nin>, \ + overload_priority<9>) CMT_NOEXCEPT \ { \ return __VA_ARGS__; \ } @@ -440,103 +459,121 @@ KFR_INTRIN_SHUFFLE_CONCAT(f32, 2, _mm_setr_ps(x.low, x.high, y.low, y.high)) KFR_INTRIN_SHUFFLE_SWAP(f32, 2, simd<f32, 2>(x.high, x.low)) #endif +#if defined CMT_COMPILER_IS_MSVC && defined CMT_ARCH_X32 +KFR_INTRINSIC __m128i _mm_cvtsi64_si128(int64_t u) +{ + __m128i r = _mm_setzero_si128(); + r.m128i_i64[0] = u; + return r; +} +KFR_INTRINSIC int64_t _mm_cvtsi128_si64(const __m128i& i) { return i.m128i_i64[0]; } +KFR_INTRINSIC int64_t _mm_cvttsd_si64(const __m128d& d) { return static_cast<int64_t>(d.m128d_f64[0]); } +KFR_INTRINSIC __m128d _mm_cvtsi64_sd(const __m128d& a, int64_t b) +{ + __m128d r = a; + r.m128d_f64[0] = static_cast<double>(b); + return r; +} +#endif + KFR_INTRIN_BITCAST(f32, i32, 1, _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(x)))) KFR_INTRIN_BITCAST(i32, f32, 1, _mm_cvtsi128_si32(_mm_castps_si128(_mm_set_ss(x)))) KFR_INTRIN_BITCAST(f64, i64, 1, _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64_si128(x)))) KFR_INTRIN_BITCAST(i64, f64, 1, _mm_cvtsi128_si64(_mm_castpd_si128(_mm_set_sd(x)))) -KFR_INTRINSIC simd<float, 2> simd_shuf(simd_tag<float, 4>, csizes_t<0, 0>, csizes_t<0, 0>, - const simd<float, 4>& x) noexcept -{ -#ifndef KFR_f32x2_array - return _mm_cvtsd_f64(_mm_castps_pd(_mm_unpacklo_ps(x, x))); -#else - float v = KFR_f32sse_INDEX(x, 0); - return simd<f32, 2>(v, v); +#ifndef CMT_ARCH_AVX +KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, simd<i8, 32>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, simd<u8, 32>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, simd<i16, 16>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, simd<u16, 16>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, simd<i32, 8>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, simd<u32, 8>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, simd<i64, 4>{ x, x }) +KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, simd<u64, 4>{ x, x }) #endif -} -// KFR_INTRINSIC simd<float, 2> simd_shuf(simd_tag<float, 4>, csizes_t<1, 1>, csizes_t<0, 0>, -// const simd<float, 4>& x) noexcept -// { -// #ifndef KFR_f32x2_array -// return _mm_cvtsd_f64(_mm_castps_pd(_mm_shuffle_ps(x, x, (shuffle_mask<8, 1, 1, 1, 1>::value)))); -// #else -// float v = KFR_f32sse_INDEX(x, 1); -// return simd<f32, 2>(v, v); -// #endif -// } // extend -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16, 2, _mm_cvtsi32_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16, 2, _mm_cvtsi32_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16, 4, _mm_cvtsi32_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16, 4, _mm_cvtsi32_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16, 8, _mm_cvtsi64_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16, 8, _mm_cvtsi64_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8, 2, _mm_cvtsi32_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8, 2, _mm_cvtsi32_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8, 4, _mm_cvtsi64_si128(x.whole)) 
-KFR_INTRIN_SHUFFLE_EXTEND(i16, 8, 4, _mm_cvtsi64_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(u32, 4, 2, _mm_cvtsi64_si128(x.whole)) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4, 2, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 1, _mm_cvtsi32_si128(u8(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 1, _mm_cvtsi32_si128(u16(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 1, _mm_cvtsi64_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 1, _mm_cvtsi32_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 1, _mm_cvtsi64_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 1, _mm_set_ss(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 1, _mm_set_sd(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 4, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 4, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 8, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 8, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 2, _mm_cvtsi32_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 4, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 4, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 2, _mm_cvtsi64_si128(x.whole)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 2, _mm_cvtsi64_si128(x.whole)) // slice -KFR_INTRIN_SHUFFLE_SLICE(i32, 1, 4, 0, _mm_cvtsi128_si32(x)) -KFR_INTRIN_SHUFFLE_SLICE(u32, 1, 4, 0, _mm_cvtsi128_si32(x)) -KFR_INTRIN_SHUFFLE_SLICE(i64, 1, 2, 0, _mm_cvtsi128_si64(x)) -KFR_INTRIN_SHUFFLE_SLICE(u64, 1, 2, 0, _mm_cvtsi128_si64(x)) -KFR_INTRIN_SHUFFLE_SLICE(f32, 1, 4, 0, _mm_cvtss_f32(x)) -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 4, 0, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 1, 4, _mm_cvtsi128_si32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 1, 4, _mm_cvtsi128_si32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x)))) #ifndef KFR_f32x2_array -KFR_INTRIN_SHUFFLE_EXTEND(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole))) #else -KFR_INTRIN_SHUFFLE_EXTEND(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high))) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high))) #endif -KFR_INTRIN_SHUFFLE_SLICE(f64, 1, 2, 0, _mm_cvtsd_f64(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x)) -KFR_INTRIN_SHUFFLE_SLICE(i8, 2, 16, 0, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x)))) -KFR_INTRIN_SHUFFLE_SLICE(i8, 4, 16, 0, simd<i8, 4>::from(_mm_cvtsi128_si32(x))) -KFR_INTRIN_SHUFFLE_SLICE(i8, 8, 16, 0, simd<i8, 8>::from(_mm_cvtsi128_si64(x))) -KFR_INTRIN_SHUFFLE_SLICE(u8, 2, 16, 0, simd<u8, 2>::from(u16(_mm_cvtsi128_si32(x)))) -KFR_INTRIN_SHUFFLE_SLICE(u8, 4, 16, 0, simd<u8, 4>::from(_mm_cvtsi128_si32(x))) -KFR_INTRIN_SHUFFLE_SLICE(u8, 8, 16, 0, simd<u8, 8>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 4, 16, simd<i8, 4>::from(_mm_cvtsi128_si32(x))) 
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 8, 16, simd<i8, 8>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 2, 16, simd<u8, 2>::from(u16(_mm_cvtsi128_si32(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 4, 16, simd<u8, 4>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 8, 16, simd<u8, 8>::from(_mm_cvtsi128_si64(x))) -KFR_INTRIN_SHUFFLE_SLICE(i16, 2, 8, 0, simd<i16, 2>::from(_mm_cvtsi128_si32(x))) -KFR_INTRIN_SHUFFLE_SLICE(i16, 4, 8, 0, simd<i16, 4>::from(_mm_cvtsi128_si64(x))) -KFR_INTRIN_SHUFFLE_SLICE(u16, 2, 8, 0, simd<u16, 2>::from(_mm_cvtsi128_si32(x))) -KFR_INTRIN_SHUFFLE_SLICE(u16, 4, 8, 0, simd<u16, 4>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 2, 8, simd<i16, 2>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 4, 8, simd<i16, 4>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 2, 8, simd<u16, 2>::from(_mm_cvtsi128_si32(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 4, 8, simd<u16, 4>::from(_mm_cvtsi128_si64(x))) -KFR_INTRIN_SHUFFLE_SLICE(i32, 2, 4, 0, simd<i32, 2>::from(_mm_cvtsi128_si64(x))) -KFR_INTRIN_SHUFFLE_SLICE(u32, 2, 4, 0, simd<u32, 2>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 2, 4, simd<i32, 2>::from(_mm_cvtsi128_si64(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 2, 4, simd<u32, 2>::from(_mm_cvtsi128_si64(x))) // high -KFR_INTRIN_SHUFFLE_SLICE(u8, 8, 16, 8, simd<u8, 8>::from(KFR_u64sse_INDEX(x, 1))) -KFR_INTRIN_SHUFFLE_SLICE(i8, 8, 16, 8, simd<i8, 8>::from(KFR_u64sse_INDEX(x, 1))) -KFR_INTRIN_SHUFFLE_SLICE(u16, 4, 8, 4, simd<u16, 4>::from(KFR_u64sse_INDEX(x, 1))) -KFR_INTRIN_SHUFFLE_SLICE(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDEX(x, 1))) -KFR_INTRIN_SHUFFLE_SLICE(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1))) -KFR_INTRIN_SHUFFLE_SLICE(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 8, 16, 8, simd<u8, 8>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 8, 16, 8, simd<i8, 8>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 4, 8, 4, simd<u16, 4>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1))) #ifndef KFR_f32x2_array -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 4, 2, simd<f32, 2>::from(take_hi_sd(_mm_castps_pd(x)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 4, 2, simd<f32, 2>::from(take_hi_sd(_mm_castps_pd(x)))) #else -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 4, 2, simd_halves<f32, 2>{ KFR_f32sse_INDEX(x, 2), KFR_f32sse_INDEX(x, 3) }) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 4, 2, + simd_halves<f32, 2>{ KFR_f32sse_INDEX(x, 2), KFR_f32sse_INDEX(x, 3) }) #endif #define KFR_INTRIN_CONVERT(Tout, Tin, N, ...) 
\ - KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ { \ return __VA_ARGS__; \ } #define KFR_INTRIN_CONVERT_NOOP_REF(Tout, Tin, N) \ - KFR_INTRINSIC const simd<Tout, N>& simd_convert(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) \ + KFR_INTRINSIC const simd<Tout, N>& simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) \ CMT_NOEXCEPT \ { \ return x; \ } #define KFR_INTRIN_CONVERT_NOOP(Tout, Tin, N) \ - KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ { \ return x; \ } @@ -631,26 +668,32 @@ KFR_INTRIN_BITCAST(f64, i64, 4, _mm256_castsi256_pd(x)) KFR_INTRIN_BITCAST(i64, f64, 4, _mm256_castpd_si256(x)) #ifndef CMT_ARCH_AVX512 -KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 16>, csizes_t<2, 3, 6, 7, 10, 11, 14, 15>, - csizeseq_t<8, 0, 0>, const simd<float, 16>& x) +KFR_INTRINSIC simd<float, 8> simd_shuffle(simd_t<float, 16>, const simd<float, 16>& x, + csizes_t<2, 3, 6, 7, 10, 11, 14, 15>, overload_priority<9>) { const __m256 t1 = _mm256_permute2f128_ps(x.low, x.high, (0 << 0) | (2 << 4)); const __m256 t2 = _mm256_permute2f128_ps(x.low, x.high, (1 << 0) | (3 << 4)); return _mm256_shuffle_ps(t1, t2, (shuffle_mask<8, 2, 3, 2, 3>::value)); } -KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 16>, csizes_t<0, 1, 4, 5, 8, 9, 12, 13>, - csizeseq_t<8, 0, 0>, const simd<float, 16>& x) + +KFR_INTRINSIC simd<float, 8> simd_shuffle(simd_t<float, 16>, const simd<float, 16>& x, + csizes_t<0, 1, 4, 5, 8, 9, 12, 13>, overload_priority<9>) { const __m256 t1 = _mm256_permute2f128_ps(x.low, x.high, (0 << 0) | (2 << 4)); const __m256 t2 = _mm256_permute2f128_ps(x.low, x.high, (1 << 0) | (3 << 4)); return _mm256_shuffle_ps(t1, t2, (shuffle_mask<8, 0, 1, 0, 1>::value)); } +#endif -KFR_INTRIN_SHUFFLE_SLICE(f64, 4, 8, 0, x.low) -KFR_INTRIN_SHUFFLE_SLICE(f64, 4, 8, 4, x.high) -KFR_INTRIN_SHUFFLE_SLICE(f32, 8, 16, 0, x.low) -KFR_INTRIN_SHUFFLE_SLICE(f32, 8, 16, 8, x.high) - +#ifndef CMT_ARCH_AVX2 +KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) #endif KFR_INTRINSIC __m256 KFR_mm256_setr_m128(__m128 x, __m128 y) @@ -672,6 +715,19 @@ KFR_INTRINSIC __m256i KFR_mm256_setr_m128i(__m128i x, __m128i y) #endif } +KFR_INTRIN_SHUFFLE_CONCAT(f32, 4, KFR_mm256_setr_m128(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(f64, 2, KFR_mm256_setr_m128d(x, y)) + +// concat +KFR_INTRIN_SHUFFLE_CONCAT(i8, 16, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i16, 8, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i32, 4, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(i64, 2, 
KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u8, 16, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u16, 8, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u32, 4, KFR_mm256_setr_m128i(x, y)) +KFR_INTRIN_SHUFFLE_CONCAT(u64, 2, KFR_mm256_setr_m128i(x, y)) + #ifndef CMT_COMPILER_GCC // GCC bug workaround KFR_INTRIN_SHUFFLE_CONCAT(i8, 1, simd<i8, 2>(x, y)) @@ -685,15 +741,34 @@ KFR_INTRIN_SHUFFLE_CONCAT(f32, 1, simd<f32, 2>{ x, y }) KFR_INTRIN_SHUFFLE_CONCAT(f64, 1, _mm_setr_pd(x, y)) -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 8, 0, _mm_cvtsd_f64(_mm_castps_pd(_mm256_castps256_ps128(x)))) -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 8, 2, take_hi_sd(_mm_castps_pd(_mm256_castps256_ps128(x)))) -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 8, 4, _mm_cvtsd_f64(_mm_castps_pd(_mm256_extractf128_ps(x, 1)))) -KFR_INTRIN_SHUFFLE_SLICE(f32, 2, 8, 6, take_hi_sd(_mm_castps_pd(_mm256_extractf128_ps(x, 1)))) +KFR_INTRIN_SHUFFLE_DUPHALVES(f32, 4, KFR_mm256_setr_m128(x, x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(f64, 2, KFR_mm256_setr_m128d(x, x)) + +// low +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 8, _mm256_castps256_ps128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 4, _mm256_castpd256_pd128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 32, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 16, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 8, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 4, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 32, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 16, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 8, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 4, _mm256_castsi256_si128(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 8, _mm_cvtsd_f64(_mm_castps_pd(_mm256_castps256_ps128(x)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 2, take_hi_sd(_mm_castps_pd(_mm256_castps256_ps128(x)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 4, _mm_cvtsd_f64(_mm_castps_pd(_mm256_extractf128_ps(x, 1)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 6, take_hi_sd(_mm_castps_pd(_mm256_extractf128_ps(x, 1)))) // extend -KFR_INTRIN_SHUFFLE_EXTEND(f32, 4 * 2, 4, _mm256_castps128_ps256(x)) -KFR_INTRIN_SHUFFLE_EXTEND(f64, 2 * 2, 2, _mm256_castpd128_pd256(x)) -KFR_INTRIN_SHUFFLE_EXTEND(f32, 4 * 2, 2, _mm256_castps128_ps256(_mm_castpd_ps(_mm_set_sd(x.whole)))) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 4, _mm256_castps128_ps256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 2, _mm256_castpd128_pd256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 2, _mm256_castps128_ps256(_mm_castpd_ps(_mm_set_sd(x.whole)))) + +// high +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4, 8, 4, _mm256_extractf128_ps(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2, 4, 2, _mm256_extractf128_pd(x, 1)) #ifndef CMT_ARCH_AVX2 // high @@ -718,6 +793,8 @@ KFR_INTRIN_SHUFFLE_LINEAR_START(u64, 2, 4, 2, KFR_INTRIN_BROADCAST(f32, 8, _mm256_set1_ps(value)) KFR_INTRIN_BROADCAST(f64, 4, _mm256_set1_pd(value)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 8, 1, _mm256_castps128_ps256(_mm_set_ss(x))) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 4, 1, _mm256_castpd128_pd256(_mm_set_sd(x))) #endif // CMT_ARCH_AVX #ifdef CMT_ARCH_AVX2 @@ -750,6 +827,15 @@ KFR_INTRIN_CONVERT(f32, i16, 8, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(x))) KFR_INTRIN_CONVERT(f32, u8, 8, _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole)))) KFR_INTRIN_CONVERT(f32, u16, 8, _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(x))) +KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 16, 32, 16, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 8, 16, 
8, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4, 8, 4, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2, 4, 2, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 16, 32, 16, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 8, 16, 8, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 4, 8, 4, _mm256_extracti128_si256(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(u64, 2, 4, 2, _mm256_extracti128_si256(x, 1)) + KFR_INTRIN_BROADCAST(i8, 32, _mm256_set1_epi8(value)) KFR_INTRIN_BROADCAST(i16, 16, _mm256_set1_epi16(value)) KFR_INTRIN_BROADCAST(i32, 8, _mm256_set1_epi32(value)) @@ -759,31 +845,45 @@ KFR_INTRIN_BROADCAST(u16, 16, _mm256_set1_epi16(value)) KFR_INTRIN_BROADCAST(u32, 8, _mm256_set1_epi32(value)) KFR_INTRIN_BROADCAST(u64, 4, _mm256_set1_epi64x(value)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 2, 16, _mm256_castsi128_si256(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 2, 8, _mm256_castsi128_si256(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 2, 4, _mm256_castsi128_si256(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i64, 2 * 2, 2, _mm256_castsi128_si256(x)) - -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u8(x)))) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u16(x)))) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(i64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRINSIC __m256i KFR_mm_broadcastsi128_si256(const __m128i& x) +{ + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, KFR_mm_broadcastsi128_si256(x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, KFR_mm_broadcastsi128_si256(x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, KFR_mm_broadcastsi128_si256(x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, KFR_mm_broadcastsi128_si256(x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, KFR_mm_broadcastsi128_si256(x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, KFR_mm_broadcastsi128_si256(x)) 
+KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, KFR_mm_broadcastsi128_si256(x)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, KFR_mm_broadcastsi128_si256(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 16, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 8, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 4, _mm256_castsi128_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 2, _mm256_castsi128_si256(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u8(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u16(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole))) KFR_INTRIN_CONVERT(i32, f32, 8, _mm256_cvttps_epi32(x)) KFR_INTRIN_CONVERT(f32, i32, 8, _mm256_cvtepi32_ps(x)) @@ -874,63 +974,64 @@ KFR_INTRIN_BROADCAST(u16, 32, _mm512_set1_epi16(value)) KFR_INTRIN_BROADCAST(u32, 16, _mm512_set1_epi32(value)) KFR_INTRIN_BROADCAST(u64, 8, _mm512_set1_epi64(value)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u8(x)))) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u16(x)))) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(i64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 4, 4, 
_mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(u32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u8(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u16(x)))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole))) KFR_INTRIN_CONVERT(i32, f32, 16, _mm512_cvttps_epi32(x)) KFR_INTRIN_CONVERT(f32, i32, 16, _mm512_cvtepi32_ps(x)) KFR_INTRIN_CONVERT(f64, i32, 8, _mm512_cvtepi32_pd(x)) KFR_INTRIN_CONVERT(i32, f64, 8, _mm512_cvttpd_epi32(x)) -KFR_INTRIN_SHUFFLE_EXTEND(f32, 4 * 4, 4, _mm512_castps128_ps512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(f64, 2 * 4, 2, _mm512_castpd128_pd512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 16 * 4, 16, _mm512_castsi128_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 4, 8, _mm512_castsi128_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 4, 4, _mm512_castsi128_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i64, 2 * 4, 2, _mm512_castsi128_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(f32, 4 * 4, 2 * 4, _mm512_castps256_ps512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(f64, 2 * 4, 2 * 2, _mm512_castpd256_pd512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i8, 
16 * 4, 2 * 16, _mm512_castsi256_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i16, 8 * 4, 2 * 8, _mm512_castsi256_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i32, 4 * 4, 2 * 4, _mm512_castsi256_si512(x)) -KFR_INTRIN_SHUFFLE_EXTEND(i64, 2 * 4, 2 * 2, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 4, _mm512_castps128_ps512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2, _mm512_castpd128_pd512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 16, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 8, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 4, _mm512_castsi128_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2, _mm512_castsi128_si512(x)) + +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 2 * 4, _mm512_castps256_ps512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2 * 2, _mm512_castpd256_pd512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2 * 16, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2 * 8, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2 * 4, _mm512_castsi256_si512(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2 * 2, _mm512_castsi256_si512(x)) // low -KFR_INTRIN_SHUFFLE_SLICE(f32, 4 * 2, 8 * 2, 0, _mm512_castps512_ps256(x)) -KFR_INTRIN_SHUFFLE_SLICE(f64, 2 * 2, 4 * 2, 0, _mm512_castpd512_pd256(x)) -KFR_INTRIN_SHUFFLE_SLICE(i8, 16 * 2, 32 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(i16, 8 * 2, 16 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(i32, 4 * 2, 8 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(i64, 2 * 2, 4 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(u8, 16 * 2, 32 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(u16, 8 * 2, 16 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(u32, 4 * 2, 8 * 2, 0, _mm512_castsi512_si256(x)) -KFR_INTRIN_SHUFFLE_SLICE(u64, 2 * 2, 4 * 2, 0, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 8 * 2, _mm512_castps512_ps256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 4 * 2, _mm512_castpd512_pd256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 16 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 16 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x)) // high -KFR_INTRIN_SHUFFLE_SLICE(f32, 4 * 2, 8 * 2, 4 * 2, _mm512_extractf32x8_ps(x, 1)) -KFR_INTRIN_SHUFFLE_SLICE(f64, 2 * 2, 4 * 2, 2 * 2, _mm512_extractf64x4_pd(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4 * 2, 8 * 2, 4 * 2, _mm512_extractf32x8_ps(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2 * 2, 4 * 2, 2 * 2, _mm512_extractf64x4_pd(x, 1)) -KFR_INTRIN_SHUFFLE_SLICE(i32, 4 * 2, 8 * 2, 4 * 2, _mm512_extracti32x8_epi32(x, 1)) -KFR_INTRIN_SHUFFLE_SLICE(i64, 2 * 2, 4 * 2, 2 * 2, _mm512_extracti64x4_epi64(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4 * 2, 8 * 2, 4 * 2, _mm512_extracti32x8_epi32(x, 1)) +KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2 * 2, 4 * 2, 2 * 2, _mm512_extracti64x4_epi64(x, 1)) // concat KFR_INTRIN_SHUFFLE_CONCAT(f32, 4 * 2, KFR_mm512_setr_m256(x, y)) @@ -951,11 +1052,10 @@ KFR_INTRIN_SHUFFLE_CONCAT(u64, 2 * 2, KFR_mm512_setr_m256i(x, y)) // generic functions template <typename T, size_t N1> -KFR_INTRINSIC 
const simd<T, N1>& simd_concat(simd_tag<T, N1>, const simd<T, N1>& x) CMT_NOEXCEPT; +KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT; template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)> -KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(simd_tag<T, N1, N2, Ns...>, const simd<T, N1>& x, - const simd<T, N2>& y, +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, const simd<T, Ns>&... z) CMT_NOEXCEPT; template <typename T, size_t N> @@ -1008,8 +1108,8 @@ template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABL KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) CMT_NOEXCEPT { constexpr size_t Nlow = prev_poweroftwo(N - 1); - return simd_concat(simd_tag_v<Tout, Nlow, N - Nlow>, simd_make_helper<Tout>(csizeseq<Nlow>, args...), - simd_make_helper<Tout>(csizeseq<N - Nlow, Nlow>, args...)); + return simd_concat<Tout, Nlow, N - Nlow>(simd_make_helper<Tout>(csizeseq<Nlow>, args...), + simd_make_helper<Tout>(csizeseq<N - Nlow, Nlow>, args...)); } template <typename T, size_t... indices, typename... Args, size_t N> @@ -1023,7 +1123,7 @@ KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... a template <typename Tout, size_t N> KFR_INTRINSIC simd<Tout, N> simd_undefined() CMT_NOEXCEPT { - NOT_OPTIMIZED; + not_optimized(CMT_FUNC_SIGNATURE); simd<Tout, N> x; return x; } @@ -1032,7 +1132,7 @@ KFR_INTRINSIC simd<Tout, N> simd_undefined() CMT_NOEXCEPT template <typename Tout, size_t N> KFR_INTRINSIC simd<Tout, N> simd_zeros() CMT_NOEXCEPT { - NOT_OPTIMIZED; + not_optimized(CMT_FUNC_SIGNATURE); return from_simd_array<Tout, N>({ Tout() }); } @@ -1040,7 +1140,7 @@ KFR_INTRINSIC simd<Tout, N> simd_zeros() CMT_NOEXCEPT template <typename Tout, size_t N> KFR_INTRINSIC simd<Tout, N> simd_allones() CMT_NOEXCEPT { - NOT_OPTIMIZED; + not_optimized(CMT_FUNC_SIGNATURE); simd_array<Tout, N> x{}; KFR_COMPONENTWISE(x.val[i] = special_constants<Tout>::allones()); return from_simd_array(x); @@ -1056,9 +1156,9 @@ template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N KFR_ENABLE_IF(Nout == 1 || N == 1) #endif > -KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT { - NOT_OPTIMIZED; + not_optimized(CMT_FUNC_SIGNATURE); return bitcast_anything<simd<Tout, Nout>>(x); } @@ -1072,19 +1172,18 @@ template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N KFR_ENABLE_IF(Nout > 1 && N > 1) #endif > -KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT { constexpr size_t Nlow = prev_poweroftwo(N - 1); - return simd_concat( - simd_tag_v<Tout, Nlow * Nout / N, (N - Nlow) * Nout / N>, - simd_bitcast(simd_cvt_tag_v<Tout, Tin, Nlow>, - simd_shuf(simd_tag_v<Tin, N>, csizeseq<Nlow>, csizeseq<Nlow, 0, 0>, x)), - simd_bitcast(simd_cvt_tag_v<Tout, Tin, N - Nlow>, - simd_shuf(simd_tag_v<Tin, N>, csizeseq<N - Nlow, Nlow>, csizeseq<N - Nlow, 0, 0>, x))); + return simd_concat<Tout, Nlow * Nout / N, (N - Nlow) * Nout / N>( + simd_bitcast(simd_cvt_t<Tout, Tin, Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)), + simd_bitcast(simd_cvt_t<Tout, Tin, N - Nlow>{}, + 
simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto))); } template <typename T, size_t N> -KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_tag<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT +KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT { return x; } @@ -1092,84 +1191,195 @@ KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_tag<T, T, N>, const simd<T template <typename T, size_t N, size_t index> KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT { - return simd_shuf(simd_tag_v<T, N>, csizes<index>, csizes<0>, value); + return simd_shuffle(simd_t<T, N>{}, value, csizes<index>, overload_auto); } template <typename T, size_t N, size_t index> KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) CMT_NOEXCEPT { - constexpr auto indexset = csizeseq<N> == csize<index>; - return simd_shuf(simd_tag_v<T, N>, select(indexset, csizeseq<N, 0, 0>, csizeseq<N>), - select(indexset, csizeseq<N, 1, 0>, csizeseq<N, 0, 0>), value); + not_optimized(CMT_FUNC_SIGNATURE); + simd_array<T, N> arr = to_simd_array<T, N>(value); + arr.val[index] = x; + return from_simd_array(arr); +} + +template <typename T, size_t N> +KFR_INTRINSIC const simd<T, N>& simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N>, + overload_priority<10>) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC const simd<T, N1>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>&, + csizeseq_t<N1>, overload_priority<9>) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC const simd<T, N2>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>&, const simd<T, N2>& y, + csizeseq_t<N2, N1>, overload_priority<9>) CMT_NOEXCEPT +{ + return y; +} + +// concat() +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && + std::is_same_v<simd<T, N + N>, simd_halves<unwrap_bit<T>, N + N>>)> +KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizeseq_t<N + N>, overload_priority<8>) CMT_NOEXCEPT +{ + return simd<T, N + N>{ x, y }; } template <typename T> -KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_tag<T, 1>, identity<T> value) CMT_NOEXCEPT +KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT { return { static_cast<unwrap_bit<T>>(value) }; } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)> -KFR_INTRINSIC simd<T, N> simd_broadcast(simd_tag<T, N>, identity<T> value) CMT_NOEXCEPT +KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) CMT_NOEXCEPT { - return simd_concat(simd_tag_v<T, Nlow, N - Nlow>, simd_broadcast(simd_tag_v<T, Nlow>, value), - simd_broadcast(simd_tag_v<T, N - Nlow>, value)); + return simd_concat<T, Nlow, N - Nlow>(simd_broadcast(simd_t<T, Nlow>{}, value), + simd_broadcast(simd_t<T, N - Nlow>{}, value)); } -template <typename T, size_t N1> -KFR_INTRINSIC const simd<T, N1>& simd_concat(simd_tag<T, N1>, const simd<T, N1>& x) CMT_NOEXCEPT +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && std::is_same_v<simd<T, N>, simd_halves<unwrap_bit<T>, N>>)> +KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2>, + overload_priority<7>) CMT_NOEXCEPT { - return x; + return x.low; +} + +template <typename T, size_t N, + KFR_ENABLE_IF(is_poweroftwo(N) && std::is_same_v<simd<T, N>, 
simd_halves<unwrap_bit<T>, N>>)> +KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2, N / 2>, + overload_priority<7>) CMT_NOEXCEPT +{ + return x.high; +} + +template <typename T, size_t N, size_t index> +KFR_INTRINSIC T simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<index>, + overload_priority<6>) CMT_NOEXCEPT +{ + return to_simd_array<T, N>(x).val[index]; } -// template <typename T, size_t N1, size_t N2, size_t N3, size_t N4> -// KFR_INTRINSIC simd<T, N1 + N2 + N3 + N4> simd_concat4(const simd<T, N1>& x, const simd<T, N2>& y, -// const simd<T, N3>& z, const simd<T, N4>& w) -// CMT_NOEXCEPT -// { -// return simd_shuf(simd_tag_v<T, N1 + N2, N3 + N4>, csizeseq<N1 + N2 + N3 + N4>, -// simd_shuf(simd_tag_v<T, N1, N2>, csizeseq<N1 + N2>, x, y), -// simd_shuf(simd_tag_v<T, N3, N4>, csizeseq<N3 + N4>, z, w)); -// } - -// template <typename T, size_t... Ns, size_t... idx> -// KFR_INTRINSIC simd<T, (Ns + ...)> simd_concat_ex(simd_tag<T>, csizes_t<Ns...>, -// const std::tuple<const simd<T, Ns>&...>& args, -// csizes_t<idx...>) CMT_NOEXCEPT -// { -// static_assert(sizeof...(Ns) >= 3); -// return simd_concat(simd_tag_v<T, >, std::get<idx>(args)...); -// } - -template <size_t start = 0, size_t N0, size_t... Ns, size_t I, size_t... sequence> -constexpr auto operands_builder(csizes_t<N0, Ns...>, csizes_t<I, sequence...>) -{ - if constexpr (sizeof...(Ns) == 0) +template <typename T, size_t Nout, size_t N> +simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsigned (&indices)[Nout]) +{ + simd_array<T, Nout> result; + for (size_t i = 0; i < Nout; ++i) { - return csizeseq<N0, start, 0>; + const size_t index = indices[i]; + result.val[i] = index >= N ? T() : static_cast<T>(x.val[index]); } - else + return result; +} + +template <typename T, size_t Nout, size_t N1, size_t N2> +simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd_array<T, N2>& y, + const unsigned (&indices)[Nout]) +{ + simd_array<T, Nout> result; + for (size_t i = 0; i < Nout; ++i) { - return cconcat(csizeseq<N0, start, 0>, - operands_builder<start + 1>(csizes<Ns...>, csizes<sequence...>)); + const size_t index = indices[i]; + result.val[i] = index >= N1 + N2 ? T() + : index >= N1 ? static_cast<T>(y.val[index - N1]) + : static_cast<T>(x.val[index]); } + return result; +} + +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>, + overload_generic) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); +#ifdef CMT_COMPILER_IS_MSVC + const simd_array<T, N> xx = to_simd_array<T, N>(x); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array)); +#else + return from_simd_array<T, Nout>( + { (indices >= N ? T() : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... }); +#endif +} + +template <typename T, size_t N, size_t N2 = N, size_t... 
indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y, + csizes_t<indices...>, overload_generic) CMT_NOEXCEPT +{ + static_assert(N == N2, ""); + not_optimized(CMT_FUNC_SIGNATURE); +#ifdef CMT_COMPILER_IS_MSVC + const simd_array<T, N> xx = to_simd_array<T, N>(x); + const simd_array<T, N> yy = to_simd_array<T, N>(y); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array)); +#else + return from_simd_array<T, Nout>( + { (indices >= N * 2 ? T() + : indices >= N ? static_cast<T>(to_simd_array<T, N>(y).val[indices - N]) + : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... }); +#endif +} + +template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2), + size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y, + csizes_t<indices...>, overload_generic) CMT_NOEXCEPT +{ + not_optimized(CMT_FUNC_SIGNATURE); + +#ifdef CMT_COMPILER_IS_MSVC + const simd_array<T, N1> xx = to_simd_array<T, N1>(x); + const simd_array<T, N2> yy = to_simd_array<T, N2>(y); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N1, N2>(xx, yy, indices_array)); +#else + + return from_simd_array<T, Nout>( + { (indices > N1 + N2 ? T() + : indices >= N1 ? static_cast<T>(to_simd_array<T, N2>(y).val[indices - N1]) + : static_cast<T>(to_simd_array<T, N1>(x).val[indices]))... }); +#endif +} + +template <typename T, size_t N1> +KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT +{ + return x; +} + +template <typename T, size_t N1, size_t N2, size_t N3, size_t N4> +KFR_INTRINSIC simd<T, N1 + N2 + N3 + N4> simd_concat4(const simd<T, N1>& x, const simd<T, N2>& y, + const simd<T, N3>& z, const simd<T, N4>& w) CMT_NOEXCEPT +{ + return simd_shuffle(simd2_t<T, N1 + N2, N3 + N4>{}, + simd_shuffle(simd2_t<T, N1, N2>{}, x, y, csizeseq<N1 + N2>, overload_auto), + simd_shuffle(simd2_t<T, N3, N4>{}, z, w, csizeseq<N3 + N4>, overload_auto), + csizeseq<N1 + N2 + N3 + N4>, overload_auto); } template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/> -KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(simd_tag<T, N1, N2, Ns...>, const simd<T, N1>& x, - const simd<T, N2>& y, +KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y, const simd<T, Ns>&... 
z) CMT_NOEXCEPT { - constexpr size_t Nargs = 2 + sizeof...(Ns); - if constexpr (Nargs == 2) + if constexpr (sizeof...(Ns) == 2) { - return simd_shuf(simd_tag_v<T, N1, N2>, cconcat(csizeseq<N1>, csizeseq<N2>), - cconcat(csizeseq<N1, 0, 0>, csizeseq<N2, 1, 0>), x, y); + return simd_concat4<T, N1, N2, Ns...>(x, y, z...); } else { - return simd_shuf(simd_tag_v<T, N1, N2, Ns...>, cconcat(csizeseq<N1>, csizeseq<N2>, csizeseq<Ns>...), - operands_builder(csizes<N1, N2, Ns...>, csizeseq<Nargs>), x, y, z...); + return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...), + csizeseq<N1 + N2 + Nscount>, overload_auto); } } @@ -1182,27 +1392,26 @@ KFR_INTRINSIC simd<Tout, N> simd_convert__(const simd<Tin, N>& x, csizes_t<indic /// @brief Converts input vector to vector with subtype Tout template <typename Tout, typename Tin, KFR_ENABLE_IF(!std::is_same<Tout, Tin>::value)> -KFR_INTRINSIC simd<Tout, 1> simd_convert(simd_cvt_tag<Tout, Tin, 1>, const simd<Tin, 1>& x) CMT_NOEXCEPT +KFR_INTRINSIC simd<Tout, 1> simd_convert(simd_cvt_t<Tout, Tin, 1>, const simd<Tin, 1>& x) CMT_NOEXCEPT { return simd_make(cometa::ctype<Tout>, static_cast<Tout>(x)); } /// @brief Converts input vector to vector with subtype Tout template <typename Tout, typename Tin, size_t N> -KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_tag<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT +KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT { constexpr size_t Nlow = prev_poweroftwo(N - 1); - return simd_concat( - simd_tag_v<Tout, Nlow, N - Nlow>, - simd_convert(simd_cvt_tag_v<Tout, Tin, Nlow>, - simd_shuf(simd_tag_v<Tin, N>, csizeseq<Nlow>, csizeseq<Nlow, 0, 0>, x)), - simd_convert(simd_cvt_tag_v<Tout, Tin, N - Nlow>, - simd_shuf(simd_tag_v<Tin, N>, csizeseq<N - Nlow, Nlow>, csizeseq<N - Nlow, 0, 0>, x))); + return simd_concat<Tout, Nlow, N - Nlow>( + simd_convert(simd_cvt_t<Tout, Tin, Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)), + simd_convert(simd_cvt_t<Tout, Tin, N - Nlow>{}, + simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto))); } /// @brief Converts input vector to vector with subtype Tout template <typename T, size_t N> -KFR_INTRINSIC const simd<T, N>& simd_convert(simd_cvt_tag<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT +KFR_INTRINSIC const simd<T, N>& simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT { return x; } @@ -1230,59 +1439,24 @@ KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, } #define SIMD_TYPE_INTRIN(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO) \ - KFR_INTRINSIC T simd_shuf(simd_tag<T, N>, csizes_t<0>, csizeseq_t<N, 0, 0>, const simd<T, N>& x) \ - { \ - return TO_SCALAR; \ - } \ - KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T, 1>, concat_lists<csizes_t<0>, csizeseq_t<N - 1, 0, 0>>, \ - concat_lists<csizes_t<0>, csizeseq_t<N - 1, operand_undefined, 0>>, \ - simd<T, 1> x) \ - { \ - return FROM_SCALAR; \ - } \ - KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T, 1>, csizeseq_t<N, 0, 0>, csizeseq_t<N, 0, 0>, \ - simd<T, 1> x) \ - { \ - return FROM_BROADCAST; \ - } \ - KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T>, csizeseq_t<N>) { return FROM_ZERO; } + KFR_INTRINSIC T simd_to_scalar(simd_t<T, N>, const simd<T, N>& x) { return TO_SCALAR; } \ + KFR_INTRINSIC simd<T, N> simd_from_scalar(simd_t<T, N>, T x) { return FROM_SCALAR; } \ + KFR_INTRINSIC simd<T, N> simd_from_broadcast(simd_t<T, N>, T x) { return FROM_BROADCAST; } \ + 
KFR_INTRINSIC simd<T, N> simd_from_zero(simd_t<T, N>) { return FROM_ZERO; } #define SIMD_TYPE_INTRIN_EX(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO, GET_LOW, GET_HIGH, \ FROM_HALVES) \ SIMD_TYPE_INTRIN(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO) \ - KFR_INTRINSIC simd<T, N / 2> simd_shuf(simd_tag<T, N>, csizeseq_t<N / 2>, csizeseq_t<N / 2, 0, 0>, \ - const simd<T, N>& x) noexcept \ - { \ - return GET_LOW; \ - } \ - KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_tag<T, N>, const simd<T, N>& x) noexcept \ - { \ - return GET_LOW; \ - } \ - KFR_INTRINSIC simd<T, N / 2> simd_shuf(simd_tag<T, N>, csizeseq_t<N / 2, N / 2>, \ - csizeseq_t<N / 2, 0, 0>, const simd<T, N>& x) noexcept \ - { \ - return GET_HIGH; \ - } \ - KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_tag<T, N>, const simd<T, N>& x) noexcept \ - { \ - return GET_HIGH; \ - } \ - KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T, N / 2, N / 2>, \ - concat_lists<csizeseq_t<N / 2>, csizeseq_t<N / 2>>, \ - concat_lists<csizeseq_t<N / 2, 0, 0>, csizeseq_t<N / 2, 1, 0>>, \ - const simd<T, N / 2>& x, const simd<T, N / 2>& y) noexcept \ - { \ - return FROM_HALVES; \ - } \ - KFR_INTRINSIC simd<T, N> simd_from_halves(simd_tag<T, N / 2, N / 2>, const simd<T, N / 2>& x, \ - const simd<T, N / 2>& y) noexcept \ + KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_t<T, N>, const simd<T, N>& x) { return GET_LOW; } \ + KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_t<T, N>, const simd<T, N>& x) { return GET_HIGH; } \ + KFR_INTRINSIC simd<T, N> simd_from_halves(simd_t<T, N>, const simd<T, N / 2>& x, \ + const simd<T, N / 2>& y) \ { \ return FROM_HALVES; \ } template <typename T, size_t Nout, size_t Nin> -KFR_INTRINSIC simd<T, Nout> simd_from_partial(simd_tag<T, Nout, Nin>, const simd<T, Nin>& x) +KFR_INTRINSIC simd<T, Nout> simd_from_partial(simd2_t<T, Nout, Nin>, const simd<T, Nin>& x) { #ifdef CMT_COMPILER_IS_MSVC union @@ -1301,14 +1475,29 @@ KFR_INTRINSIC simd<T, Nout> simd_from_partial(simd_tag<T, Nout, Nin>, const simd return u.out; #endif } +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_t<T, N>, const simd<T, N>& x) +{ + return x.low; +} +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_t<T, N>, const simd<T, N>& x) +{ + return x.high; +} +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_from_halves(simd_t<T, N>, const simd<T, N / 2>& x, const simd<T, N / 2>& y) +{ + return { x, y }; +} -KFR_INTRINSIC simd<float, 4> simd_from_halves(simd_tag<float, 4>, const simd<float, 2>& x, +KFR_INTRINSIC simd<float, 4> simd_from_halves(simd_t<float, 4>, const simd<float, 2>& x, const simd<float, 2>& y) { return _mm_castpd_ps(_mm_setr_pd(x.whole, y.whole)); } -KFR_INTRINSIC simd<double, 2> simd_from_halves(simd_tag<double, 2>, const simd<double, 1>& x, +KFR_INTRINSIC simd<double, 2> simd_from_halves(simd_t<double, 2>, const simd<double, 1>& x, const simd<double, 1>& y) { return _mm_setr_pd(x, y); @@ -1316,14 +1505,6 @@ KFR_INTRINSIC simd<double, 2> simd_from_halves(simd_tag<double, 2>, const simd<d SIMD_TYPE_INTRIN(f32, 4, _mm_cvtss_f32(x), _mm_set_ss(x), _mm_set1_ps(x), _mm_setzero_ps()) SIMD_TYPE_INTRIN(f64, 2, _mm_cvtsd_f64(x), _mm_set_sd(x), _mm_set1_pd(x), _mm_setzero_pd()) -SIMD_TYPE_INTRIN(u8, 16, _mm_cvtsi128_si32(x), _mm_cvtsi32_si128(x), _mm_set1_epi8(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(i8, 16, _mm_cvtsi128_si32(x), _mm_cvtsi32_si128(x), _mm_set1_epi8(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(u16, 8, _mm_cvtsi128_si32(x), _mm_cvtsi32_si128(x), 
_mm_set1_epi16(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(i16, 8, _mm_cvtsi128_si32(x), _mm_cvtsi32_si128(x), _mm_set1_epi16(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(u32, 4, _mm_cvtsi128_si32(x), _mm_cvtsi32_si128(x), _mm_set1_epi32(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(i32, 4, _mm_cvtsi128_si32(x), _mm_cvtsi32_si128(x), _mm_set1_epi32(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(u64, 2, _mm_cvtsi128_si64(x), _mm_cvtsi64_si128(x), _mm_set1_epi64x(x), _mm_setzero_si128()) -SIMD_TYPE_INTRIN(i64, 2, _mm_cvtsi128_si64(x), _mm_cvtsi64_si128(x), _mm_set1_epi64x(x), _mm_setzero_si128()) #ifdef CMT_ARCH_AVX SIMD_TYPE_INTRIN_EX(f32, 8, _mm256_cvtss_f32(x), _mm256_castps128_ps256(_mm_set_ss(x)), _mm256_set1_ps(x), @@ -1333,38 +1514,6 @@ SIMD_TYPE_INTRIN_EX(f64, 4, _mm256_cvtsd_f64(x), _mm256_castpd128_pd256(_mm_set_ _mm256_setzero_pd(), _mm256_castpd256_pd128(x), _mm256_extractf128_pd(x, 1), KFR_mm256_setr_m128d(x, y)) #endif -#ifdef CMT_ARCH_AVX2 -SIMD_TYPE_INTRIN_EX(u8, 32, _mm_cvtsi128_si32(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi32_si128(x)), _mm256_set1_epi8(x), _mm256_setzero_si256(), - _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(i8, 32, _mm_cvtsi128_si32(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi32_si128(x)), _mm256_set1_epi8(x), _mm256_setzero_si256(), - _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(u16, 16, _mm_cvtsi128_si32(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi32_si128(x)), _mm256_set1_epi16(x), - _mm256_setzero_si256(), _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), - KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(i16, 16, _mm_cvtsi128_si32(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi32_si128(x)), _mm256_set1_epi16(x), - _mm256_setzero_si256(), _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), - KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(u32, 8, _mm_cvtsi128_si32(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi32_si128(x)), _mm256_set1_epi32(x), - _mm256_setzero_si256(), _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), - KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(i32, 8, _mm_cvtsi128_si32(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi32_si128(x)), _mm256_set1_epi32(x), - _mm256_setzero_si256(), _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), - KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(u64, 4, _mm_cvtsi128_si64(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi64_si128(x)), _mm256_set1_epi64x(x), - _mm256_setzero_si256(), _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), - KFR_mm256_setr_m128i(x, y)) -SIMD_TYPE_INTRIN_EX(i64, 4, _mm_cvtsi128_si64(_mm256_castsi256_si128(x)), - _mm256_castsi128_si256(_mm_cvtsi64_si128(x)), _mm256_set1_epi64x(x), - _mm256_setzero_si256(), _mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1), - KFR_mm256_setr_m128i(x, y)) -#endif #ifdef CMT_ARCH_AVX512 SIMD_TYPE_INTRIN_EX(f32, 16, _mm512_cvtss_f32(x), _mm512_castps128_ps512(_mm_set_ss(x)), _mm512_set1_ps(x), @@ -1378,41 +1527,21 @@ SIMD_TYPE_INTRIN_EX(f64, 8, _mm512_cvtsd_f64(x), _mm512_castpd128_pd512(_mm_set_ #ifdef CMT_ARCH_SSE2 template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<float, 4> simd_shuf(simd_tag<float, 4>, csizes_t<I0, I1, I2, I3>, csizes_t<0, 0, 0, 0>, - const simd<float, 4>& x) +KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 4>, 
const simd<float, 4>& x, + csizes_t<I0, I1, I2, I3>) { // SSE -> SSE return _mm_shuffle_ps(x, x, (shuffle_mask<8, I0, I1, I2, I3>::value)); } template <size_t I0, size_t I1> -KFR_INTRINSIC simd<double, 2> simd_shuf(simd_tag<double, 2>, csizes_t<I0, I1>, csizes_t<0, 0>, - const simd<double, 2>& x) +KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, csizes_t<I0, I1>) { // SSE -> SSE return _mm_shuffle_pd(x, x, (shuffle_mask<2, I0, I1>::value)); } #endif -#ifdef CMT_ARCH_SSE41 - -template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<float, 4> simd_shuf(simd_tag<float, 4, 4>, csizes_t<0, 1, 2, 3>, csizes_t<I0, I1, I2, I3>, - const simd<float, 4>& x, const simd<float, 4>& y) -{ - // SSE -> SSE - return _mm_blend_ps(x, y, (shuffle_mask<8, I0, I1, I2, I3, 0, 0, 0, 0>::value)); -} - -template <size_t I0, size_t I1> -KFR_INTRINSIC simd<double, 2> simd_shuf(simd_tag<double, 2, 2>, csizes_t<0, 1>, csizes_t<I0, I1>, - const simd<double, 2>& x, const simd<double, 2>& y) -{ - // SSE -> SSE - return _mm_blend_pd(x, y, (shuffle_mask<2, I0, I1>::value)); -} -#endif - template <uint8_t max> KFR_INTRINSIC constexpr uint8_t vec_idx(size_t value) { @@ -1424,7 +1553,7 @@ KFR_INTRINSIC constexpr uint8_t vec_idx(size_t value) template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15> KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( - simd_tag<float, 16>, const simd<float, 16>& x, + simd_t<float, 16>, const simd<float, 16>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>) { // AVX512 -> AVX512 @@ -1437,7 +1566,7 @@ KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_tag<double, 8>, const simd<double, 8>& x, +KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // AVX512 -> AVX512 @@ -1448,7 +1577,7 @@ KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_tag<double, 8>, const simd<d } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_tag<float, 16>, const simd<float, 16>& x, +KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 16>, const simd<float, 16>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // AVX512 -> AVX @@ -1461,7 +1590,7 @@ KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_tag<float, 16>, const simd<fl } template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_tag<float, 16>, const simd<float, 16>& x, +KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 16>, const simd<float, 16>& x, csizes_t<I0, I1, I2, I3>) { // AVX512 -> SSE @@ -1474,7 +1603,7 @@ KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_tag<float, 16>, const simd<fl } template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_tag<double, 8>, const simd<double, 8>& x, +KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, csizes_t<I0, I1, I2, I3>) { // AVX512 -> AVX @@ -1485,8 +1614,7 @@ KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_tag<double, 8>, const simd<d } template <size_t I0, size_t I1> -KFR_INTRINSIC simd<double, 2> 
simd_vec_shuffle(simd_tag<double, 8>, const simd<double, 8>& x, - csizes_t<I0, I1>) +KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, csizes_t<I0, I1>) { // AVX512 -> SSE return _mm512_castpd512_pd128(_mm512_permutexvar_pd( @@ -1498,7 +1626,7 @@ KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_tag<double, 8>, const simd<d template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15> KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( - simd_tag<float, 8>, const simd<float, 8>& x, + simd_t<float, 8>, const simd<float, 8>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>) { // AVX -> AVX512 @@ -1513,7 +1641,7 @@ KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15> KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( - simd_tag<float, 4>, const simd<float, 4>& x, + simd_t<float, 4>, const simd<float, 4>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>) { // SSE -> AVX512 @@ -1526,7 +1654,7 @@ KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_tag<double, 4>, const simd<double, 4>& x, +KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // AVX -> AVX512 @@ -1537,7 +1665,7 @@ KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_tag<double, 4>, const simd<d } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_tag<double, 2>, const simd<double, 2>& x, +KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // SSE -> AVX512 @@ -1552,29 +1680,31 @@ KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_tag<double, 2>, const simd<d #ifdef CMT_ARCH_AVX template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 8>, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>, - csizeseq_t<8, 0, 0>, const simd<float, 8>& x) +KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 8>, const simd<float, 8>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // AVX -> AVX if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && csizes<I0, I1, I2, I3>.equal(csizes<I4, I5, I6, I7>)) { - const simd<float, 4> tmp = simd_shuf(simd_tag_v<float, 4>, csizes<I0, I1, I2, I3>, csizes<0, 0, 0, 0>, - simd_get_low(simd_tag_v<float, 8>, x)); - return simd_from_halves(simd_tag_v<float, 8>, tmp, tmp); + const simd<float, 4> tmp = universal_shuffle(simd_t<float, 4>{}, simd_get_low(simd_t<float, 8>{}, x), + csizes<I0, I1, I2, I3>); + return simd_from_halves(simd_t<float, 8>{}, tmp, tmp); } else if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && cminof(csizes<I4, I5, I6, I7>) >= 4) { - if constexpr (csizes<I4, I5, I6, I7>.equal(csizes<I0 + 4, I1 + 4, I2 + 4, I3 + 4>)) + if constexpr (csizes<I0, I1, I2, I3, I4, I5, I6, I7>.equal( + csizes<I0, I1, I2, I3, I0 + 4, I1 + 4, I2 + 4, I3 + 4>)) { return _mm256_shuffle_ps(x, x, (shuffle_mask<8, 
I0, I1, I2, I3>::value)); } else { - return simd_from_halves(simd_tag_v<float, 8>, - simd_shuf(simd_tag_v<float, 4>, simd_get_low(simd_tag_v<float, 8>, x), - csizes<I0, I1, I2, I3>), - simd_shuf(simd_tag_v<float, 4>, simd_get_high(simd_tag_v<float, 8>, x), - csizes<I4 - 4, I5 - 4, I6 - 4, I7 - 4>)); + return simd_from_halves(simd_t<float, 8>{}, + universal_shuffle(simd_t<float, 4>{}, simd_get_low(simd_t<float, 8>{}, x), + csizes<I0, I1, I2, I3>), + universal_shuffle(simd_t<float, 4>{}, + simd_get_high(simd_t<float, 8>{}, x), + csizes<I4, I5, I6, I7>)); } } else @@ -1591,19 +1721,30 @@ KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 8>, csizes_t<I0, I1, I2, } template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 4>, csizes_t<I0, I1, I2, I3>, csizeseq_t<4, 0, 0>, - const simd<double, 4>& x) +KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, + csizes_t<I0, I1, I2, I3>) { // AVX -> AVX if constexpr (cmaxof(csizes<I0, I1>) < 2 && csizes<I0, I1>.equal(csizes<I2, I3>)) { - const simd<double, 2> tmp = simd_shuf(simd_tag_v<double, 2>, csizes<I0, I1>, csizes<0, 0>, - simd_get_low(simd_tag_v<double, 4>, x)); - return simd_from_halves(simd_tag_v<double, 4>, tmp, tmp); + const simd<double, 2> tmp = + universal_shuffle(simd_t<double, 2>{}, simd_get_low(simd_t<double, 4>{}, x), csizes<I0, I1>); + return simd_from_halves(simd_t<double, 4>{}, tmp, tmp); } - else if constexpr (I0 < 2 && I1 < 2 && I2 >= 2 && I3 >= 2 && I2 == I0 + 2 && I3 == I1 + 2) + else if constexpr (cmaxof(csizes<I0, I1>) < 4 && cminof(csizes<I2, I3>) >= 4) { - return _mm256_shuffle_pd(x, x, (shuffle_mask<4, I0, I1, I2 - 2, I3 - 2>::value)); + if constexpr (csizes<I0, I1, I2, I3>.equal(csizes<I0, I1, I2 + 2, I3 + 2>)) + { + return _mm256_shuffle_pd(x, x, (shuffle_mask<2, I0, I1>::value)); + } + else + { + return simd_from_halves( + simd_t<double, 4>{}, + universal_shuffle(simd_t<double, 2>{}, simd_get_low(simd_t<double, 4>{}, x), csizes<I0, I1>), + universal_shuffle(simd_t<double, 2>{}, simd_get_high(simd_t<double, 4>{}, x), + csizes<I2, I3>)); + } } else { @@ -1617,20 +1758,20 @@ KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 4>, csizes_t<I0, I1, I2 } template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<float, 4> simd_shuf(simd_tag<float, 8>, csizes_t<I0, I1, I2, I3>, csizeseq_t<4, 0, 0>, - const simd<float, 8>& x) +KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 8>, const simd<float, 8>& x, + csizes_t<I0, I1, I2, I3>) { // AVX -> SSE if constexpr (I0 % 4 == 0 && I1 % 4 == 1 && I2 % 4 == 2 && I3 % 4 == 3) { - __m128 t1 = simd_get_low(simd_tag_v<float, 8>, x); - __m128 t2 = simd_get_high(simd_tag_v<float, 8>, x); + __m128 t1 = simd_get_low(simd_t<float, 8>{}, x); + __m128 t2 = simd_get_high(simd_t<float, 8>{}, x); return _mm_blend_ps(t1, t2, (shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value)); } else { - __m128 t1 = simd_get_low(simd_tag_v<float, 8>, x); - __m128 t2 = simd_get_high(simd_tag_v<float, 8>, x); + __m128 t1 = simd_get_low(simd_t<float, 8>{}, x); + __m128 t2 = simd_get_high(simd_t<float, 8>{}, x); t1 = _mm_permute_ps(t1, (shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value)); t2 = _mm_permute_ps(t2, (shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value)); return _mm_blend_ps(t1, t2, (shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value)); @@ -1638,55 +1779,28 @@ KFR_INTRINSIC simd<float, 4> simd_shuf(simd_tag<float, 8>, csizes_t<I0, I1, I2, } template <size_t I0, 
size_t I1> -KFR_INTRINSIC simd<double, 2> simd_shuf(simd_tag<double, 4>, csizes_t<I0, I1>, csizeseq_t<2, 0, 0>, - const simd<double, 4>& x) +KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, csizes_t<I0, I1>) { // AVX -> SSE if constexpr (I0 % 2 == 0 && I1 % 2 == 1) { - __m128d t1 = simd_get_low(simd_tag_v<double, 4>, x); - __m128d t2 = simd_get_high(simd_tag_v<double, 4>, x); + __m128d t1 = simd_get_low(simd_t<double, 4>{}, x); + __m128d t2 = simd_get_high(simd_t<double, 4>{}, x); return _mm_blend_pd(t1, t2, (shuffle_mask<2, I0 / 2, I1 / 2>::value)); } else { - __m128d t1 = simd_get_low(simd_tag_v<double, 4>, x); - __m128d t2 = simd_get_high(simd_tag_v<double, 4>, x); + __m128d t1 = simd_get_low(simd_t<double, 4>{}, x); + __m128d t2 = simd_get_high(simd_t<double, 4>{}, x); t1 = _mm_permute_pd(t1, (shuffle_mask<2, I0 % 2, I1 % 2>::value)); t2 = _mm_permute_pd(t2, (shuffle_mask<2, I0 % 2, I1 % 2>::value)); return _mm_blend_pd(t1, t2, (shuffle_mask<2, I0 / 2, I1 / 2>::value)); } } -KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 4, 4>, csizes_t<0, 2, 0, 2> idx, - csizes_t<0, 0, 1, 1>, const simd<double, 4>& x, - const simd<double, 4>& y) -{ - auto tmp1 = _mm256_permute2f128_pd(x, y, (0 << 0) | (2 << 4)); - auto tmp2 = _mm256_permute2f128_pd(x, y, (1 << 0) | (3 << 4)); - return _mm256_unpacklo_pd(tmp1, tmp2); -} -KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 4, 4>, csizes_t<1, 3, 1, 3> idx, - csizes_t<0, 0, 1, 1>, const simd<double, 4>& x, - const simd<double, 4>& y) -{ - auto tmp1 = _mm256_permute2f128_pd(x, y, (0 << 0) | (2 << 4)); - auto tmp2 = _mm256_permute2f128_pd(x, y, (1 << 0) | (3 << 4)); - return _mm256_unpackhi_pd(tmp1, tmp2); -} - -template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 8>, csizes_t<I0, I1, I2, I3> idx, - csizeseq_t<4, 0, 0>, const simd<double, 8>& x) -{ - // AVX*2 -> AVX - return simd_shuf(simd_tag_v<double, 4, 4>, csizes<I0 % 4, I1 % 4, I2 % 4, I3 % 4>, - csizes<I0 / 4, I1 / 4, I2 / 4, I3 / 4>, x.low, x.high); -} - template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 4>, csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>, - csizeseq_t<8, 0, 0>, const simd<float, 4>& x) +KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 4>, const simd<float, 4>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // SSE -> AVX return KFR_mm256_setr_m128(_mm_shuffle_ps(x, x, (shuffle_mask<8, I0, I1, I2, I3>::value)), @@ -1694,332 +1808,111 @@ KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 4>, csizes_t<I0, I1, I2, } template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 2>, csizes_t<I0, I1, I2, I3>, csizeseq_t<4, 0, 0>, - const simd<double, 2>& x) +KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, + csizes_t<I0, I1, I2, I3>) { // SSE -> AVX return KFR_mm256_setr_m128d(_mm_shuffle_pd(x, x, (shuffle_mask<2, I0, I1>::value)), _mm_shuffle_pd(x, x, (shuffle_mask<2, I2, I3>::value))); } -template <size_t I0> -KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 8, 8>, csizes_t<0, 1, 2, 3, 4, 5, 6, 7>, - csizes_t<I0, I0, I0, I0, I0, I0, I0, I0>, const simd<float, 8>& x, - const simd<float, 8>& y) -{ - if constexpr (I0 == 0) - return x; - else - return y; -} - -template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> 
-KFR_INTRINSIC simd<float, 8> simd_shuf(simd_tag<float, 8, 8>, csizes_t<0, 1, 2, 3, 4, 5, 6, 7>, - csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>, const simd<float, 8>& x, - const simd<float, 8>& y) -{ - // AVX -> AVX - return _mm256_blend_ps(x, y, (shuffle_mask<8, I0, I1, I2, I3, I4, I5, I6, I7>::value)); -} - -template <size_t I0, size_t I1, size_t I2, size_t I3> -KFR_INTRINSIC simd<double, 4> simd_shuf(simd_tag<double, 4, 4>, csizes_t<0, 1, 2, 3>, - csizes_t<I0, I1, I2, I3>, const simd<double, 4>& x, - const simd<double, 4>& y) -{ - // AVX -> AVX - return _mm256_blend_pd(x, y, (shuffle_mask<4, I0, I1, I2, I3>::value)); -} - -/*KFR_INTRINSIC simd<double, 32> simd_shuf( - simd_tag<double, 32>, - csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, - 15, 22, 23, 30, 31>, - csizes_t<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>, - const simd<double, 32>& x) -{ - auto tmp0 = x.low.low.low; - auto tmp1 = x.low.low.high; - auto tmp2 = x.low.high.low; - auto tmp3 = x.low.high.high; - auto tmp4 = x.high.low.low; - auto tmp5 = x.high.low.high; - auto tmp6 = x.high.high.low; - auto tmp7 = x.high.high.high; - auto tmp8 = _mm256_permute2f128_pd(tmp7, tmp6, 32); - tmp6 = _mm256_permute2f128_pd(tmp7, tmp6, 49); - tmp7 = _mm256_permute2f128_pd(tmp5, tmp4, 32); - tmp4 = _mm256_permute2f128_pd(tmp5, tmp4, 49); - tmp5 = _mm256_permute2f128_pd(tmp0, tmp2, 32); - tmp0 = _mm256_permute2f128_pd(tmp0, tmp2, 49); - tmp2 = _mm256_permute2f128_pd(tmp1, tmp3, 32); - tmp1 = _mm256_permute2f128_pd(tmp1, tmp3, 49); - return { - { { tmp5, tmp8 }, { tmp0, tmp6 } }, - { { tmp2, tmp7 }, { tmp1, tmp4 } }, - }; -}*/ - #endif -#ifdef CMT_ARCH_AVX2 - -template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<u32, 8> simd_shuf(simd_tag<u32, 8, 8>, csizes_t<0, 1, 2, 3, 4, 5, 6, 7>, - csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>, const simd<u32, 8>& x, - const simd<u32, 8>& y) -{ - // AVX -> AVX - return _mm256_blend_epi32(x, y, (shuffle_mask<8, I0, I1, I2, I3, I4, I5, I6, I7>::value)); -} -template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> -KFR_INTRINSIC simd<i32, 8> simd_shuf(simd_tag<i32, 8, 8>, csizes_t<0, 1, 2, 3, 4, 5, 6, 7>, - csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>, const simd<i32, 8>& x, - const simd<i32, 8>& y) -{ - // AVX -> AVX - return _mm256_blend_epi32(x, y, (shuffle_mask<8, I0, I1, I2, I3, I4, I5, I6, I7>::value)); -} - -#endif - -constexpr size_t decode_operand(size_t index, csizes_t<>) { return operand_undefined; } -constexpr size_t decode_offset(size_t index, csizes_t<>) { return 0; } - -template <size_t N0, size_t... Ns> -constexpr size_t decode_operand(size_t index, csizes_t<N0, Ns...> v) -{ - if (index >= (Ns + ... + N0)) - return operand_undefined; - return index < N0 ? 0 : 1 + decode_operand(index - N0, csizes_t<Ns...>{}); -} -template <size_t N0, size_t... Ns> -constexpr size_t decode_offset(size_t index, csizes_t<N0, Ns...>) -{ - return index < N0 ? index : decode_offset(index - N0, csizes_t<Ns...>{}); -} - -template <typename T, size_t... Ns, size_t... indices, size_t Nargs = sizeof...(Ns), - size_t Nout = sizeof...(indices)> -KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_tag<T, Ns...> s, csizes_t<indices...>, - const simd<T, Ns>&... xs) CMT_NOEXCEPT +template <typename T, size_t Nin, size_t... 
indices, size_t Nout> +KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizes_t<indices...>) { - return simd_shuf(s, csizes<decode_offset(indices, csizes<Ns...>)...>, - csizes<decode_operand(indices, csizes<Ns...>)...>, xs...); -} + using Indices = csizes_t<indices...>; -template <typename T, size_t N> -KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T, N, N>, csizeseq_t<N>, csizeseq_t<N, 0, 0>, const simd<T, N>& x, - const simd<T, N>& y) CMT_NOEXCEPT -{ - return x; -} -template <typename T, size_t N> -KFR_INTRINSIC simd<T, N> simd_shuf(simd_tag<T, N, N>, csizeseq_t<N>, csizeseq_t<N, 1, 0>, const simd<T, N>& x, - const simd<T, N>& y) CMT_NOEXCEPT -{ - return y; -} + constexpr bool floating = typeclass<T> == datatype::f; -template <typename T, size_t N0, size_t... Ns, size_t... offsets, size_t... operands, - size_t Nargs = 1 + sizeof...(Ns), size_t Nout = sizeof...(offsets)> -KFR_INTRINSIC simd<T, Nout> simd_shuf(fallback_simd_tag<T, N0, Ns...>, csizes_t<offsets...>, - csizes_t<operands...>, const simd<T, N0>& x0, - const simd<T, Ns>&... xs) CMT_NOEXCEPT -{ - using Offsets = csizes_t<offsets...>; - using Operands = csizes_t<operands...>; constexpr size_t minwidth = minimum_vector_width<T>; constexpr size_t maxwidth = vector_width<T>; - constexpr size_t Nin = N0; -#if 0 - simd_array<T, Nout> result; - auto tup = std::forward_as_tuple(x0, xs...); - constexpr csizes_t<N0, Ns...> sizes{}; - constexpr csizes_t<offsets...> offsets_{}; - constexpr csizes_t<flush_op(operands)...> operands_{}; - cforeach(csizeseq<Nout>, - [&](auto idx_) - { - constexpr size_t idx = val_of(decltype(idx_)()); - result.val[idx] = - to_simd_array<T, sizes[csize<operands_[csize<idx>]>]>(std::get<operands_[csize<idx>]>(tup)) - .val[offsets_[csize<idx>]]; - }); - return from_simd_array<T, Nout>(result); -#else + constexpr size_t minindex = cminof(Indices{}); + constexpr size_t maxindex = cmaxof(csizes<(indices >= Nin ? 
0 : indices)...>); - if constexpr (Operands{}.equal(csizes<Nout, operand_undefined, 0>)) + if constexpr (Nin == 1 && Nout == 1) { - constexpr size_t maxwidth = vector_width<T>; - constexpr size_t minwidth = minimum_vector_width<T>; - if constexpr (!is_poweroftwo(Nout)) + return x; + } + else if constexpr (next_poweroftwo(Nin) == next_poweroftwo(Nout) && Indices{}.equal(csizeseq<Nout>)) + { + return x; + } + else if constexpr (!is_poweroftwo(Nin) || !is_poweroftwo(Nout)) + { + // Fix if not power of two + return universal_shuffle( + simd_t<T, next_poweroftwo(Nin)>{}, x, + cconcat(Indices{}, csizeseq<next_poweroftwo(Nout) - Nout, index_undefined, 0>)); + } + else if constexpr (Nout < minwidth) + { + // Expand indices if less than vector + const simd<T, minwidth> tmp = universal_shuffle( + simd_t<T, Nin>{}, x, cconcat(Indices{}, csizeseq<minwidth - Nout, index_undefined, 0>)); + + if constexpr (Nout == 1) { - // align to the next power of two - constexpr size_t Nnext = next_poweroftwo(Nout); - return simd_shuf(simd_tag_v<T>, csizeseq<Nnext, 0, 0>, csizeseq<Nnext, operand_undefined, 0>); + return simd_to_scalar(simd_t<T, minwidth>{}, tmp); } - else if (Nout < minwidth) + else { - return simd<T, Nout>{}; + union + { + simd<T, minwidth> tmp; + simd<T, Nout> r; + } u{ tmp }; + return u.r; } - else if constexpr (Nout > maxwidth) + } + else if constexpr (Nout > maxwidth) + { + auto lowi = Indices{}[csizeseq<Nout / 2, 0>]; + auto highi = Indices{}[csizeseq<Nout / 2, Nout / 2>]; + if constexpr (lowi.equal(highi)) { - // divide by halves - constexpr size_t Nhalf = Nout / 2; - auto tmp = simd_shuf(simd_tag_v<T>, csizeseq<Nhalf, 0, 0>, csizeseq<Nhalf, operand_undefined, 0>); + auto tmp = universal_shuffle(simd_t<T, Nin>{}, x, lowi); return { tmp, tmp }; } else { - static_assert(Nout == 0, "Required intrinsic is not defined"); - return {}; + return { universal_shuffle(simd_t<T, Nin>{}, x, lowi), + universal_shuffle(simd_t<T, Nin>{}, x, highi) }; } } - else if constexpr (!(is_poweroftwo(N0) && ... && is_poweroftwo(Ns)) || !is_poweroftwo(Nout)) + else if constexpr (minindex >= Nin) { - // inputs or outputs are not aligned to power of two size, extend them - // internally all vectors have storage with aligned size, so just fixing the tag - // and extending offsets/operands - return simd_shuf(simd_tag_v<T, next_poweroftwo(N0), next_poweroftwo(Ns)...>, - cconcat(Offsets{}, csizeseq<next_poweroftwo(Nout) - Nout, 0, 0>), - cconcat(Operands{}, csizeseq<next_poweroftwo(Nout) - Nout, operand_undefined, 0>), - x0, xs...); + return simd_from_zero(simd_t<T, Nout>{}); } - else if constexpr (Nargs == 2 && Nout > maxwidth && Nout == N0 && ((N0 == Ns) && ...) && - Offsets{}.equal(csizes<Nout * 2> % csize<Nout>) && - Operands{}.equal(csizes<Nout * 2> / csize<Nout>)) + else if constexpr (Nin == 1) { - // concat of two equal vectors - return { x0, xs... 
}; + return simd_from_broadcast(simd_t<T, Nout>{}, x); } - else if constexpr (Nout > maxwidth) + else if constexpr (Nin < minwidth) { - static_assert(Nout % 2 == 0); - // output vector size is larger than maximum vector size - // internally it consists of two vectors of half size - // do the shuffle on each half and concat result - constexpr auto looffs = Offsets{}[csizeseq<Nout / 2, 0>]; - constexpr auto hioffs = Offsets{}[csizeseq<Nout / 2, Nout / 2>]; - constexpr auto looper = Operands{}[csizeseq<Nout / 2, 0>]; - constexpr auto hioper = Operands{}[csizeseq<Nout / 2, Nout / 2>]; - // check if left and right halves are equal - if constexpr (looffs.equal(hioffs) && looper.equal(hioper)) - { - // do the shuffle only once - auto tmp = simd_shuf(simd_tag_v<T, N0, Ns...>, looffs, looper, x0, xs...); - return { tmp, tmp }; - } - else - { - return { simd_shuf(simd_tag_v<T, N0, Ns...>, looffs, looper, x0, xs...), - simd_shuf(simd_tag_v<T, N0, Ns...>, hioffs, hioper, x0, xs...) }; - } + return universal_shuffle(simd_t<T, minwidth>{}, simd_from_partial(simd2_t<T, minwidth, Nin>{}, x), + Indices{}); } - else if constexpr (Nout < minwidth && !Offsets{}.equal(csizes<Nout>) && - !Operands{}.equal(csizeseq<Nout, 0, 0>)) + else if constexpr (Nin > Nout && maxindex < Nin / 2) { - // output vector size is smaller than min vector size - // do the shuffle as if we had large enough vector - const simd<T, minwidth> tmp = - simd_shuf(simd_tag_v<T, N0, Ns...>, cconcat(Offsets{}, csizeseq<minwidth - Nout, 0, 0>), - cconcat(Operands{}, csizeseq<minwidth - Nout, operand_undefined, 0>), x0, xs...); - // then trim it (*) - return simd_shuf(simd_tag_v<T, minwidth>, csizeseq<Nout>, csizeseq<Nout, 0, 0>, tmp); + return universal_shuffle(simd_t<T, Nin / 2>{}, simd_get_low(simd_t<T, Nin>{}, x), Indices{}); } - else if constexpr (Nargs >= 2) + else if constexpr (Nin > Nout && minindex >= Nin / 2) { - constexpr size_t opmin = cminof(Operands{}); - constexpr size_t opmax = cmaxof(Operands{}); - constexpr size_t Nmin = cminof(csizes<N0, Ns...>); - constexpr size_t Nmax = cmaxof(csizes<N0, Ns...>); - if constexpr (opmin == opmax && opmax != operand_undefined) - { - // only one operand is actually used - auto tup = std::forward_as_tuple(x0, xs...); - constexpr auto ns = csizes<N0, Ns...>; - // dropping other operands - return simd_shuf(simd_tag_v<T, ns[csize<opmin>]>, Offsets{}, csizeseq<Nout, 0, 0>, - std::get<opmin>(tup)); - } - else if constexpr (Nargs > 2 && opmax == opmin + 1 && opmin != operand_undefined && - opmax != operand_undefined) - { - // only two consecutive operands are actually used - auto tup = std::forward_as_tuple(x0, xs...); - constexpr auto ns = csizes<N0, Ns...>; - // dropping other operands - return simd_shuf(simd_tag_v<T, ns[csize<opmin>], ns[csize<opmax>]>, Offsets{}, - select(Operands{} == csize<opmin>, csizeseq<Nout, 0, 0>, csizeseq<Nout, 1, 0>), - std::get<opmin>(tup), std::get<opmax>(tup)); - } - else if constexpr (Nmin == Nmax && Nmin > maxwidth && Nmin > Nout && cmaxof(Offsets{}) < Nmin / 2) - { - return simd_shuf(simd_tag_v<T, Nmin / 2, (Ns / 2)...>, Offsets{}, Operands{}, x0.low, xs.low...); - } - else if constexpr (Nmin == Nmax && Nmin > maxwidth && Nmin > Nout && cminof(Offsets{}) >= Nmin / 2) - { - return simd_shuf(simd_tag_v<T, Nmin / 2, (Ns / 2)...>, csizes<(offsets - csize<Nmin / 2>)...>, - Operands{}, x0.high, xs.high...); - } - else - { - NOT_OPTIMIZED; - auto tup = std::forward_as_tuple(x0, xs...); - constexpr auto ns = csizes<N0, Ns...>; - return from_simd_array<T, Nout>( - 
simd_array<T, Nout>{ simd_get_element<T, ns[csize<flush_op(operands)>]>( - std::get<flush_op(operands)>(tup), offsets)... }); - } + return universal_shuffle(simd_t<T, Nin / 2>{}, simd_get_high(simd_t<T, Nin>{}, x), + csizes<(indices < Nin ? indices - csize<Nin / 2> : indices)...>); + } + else if constexpr (Nin >= minwidth && Nin <= maxwidth && Nout >= minwidth && Nout <= maxwidth) + { + return simd_vec_shuffle(simd_t<T, Nin>{}, x, Indices{}); } else { - if constexpr (Nin == Nout && Offsets{}.equal(csizeseq<Nin>)) - { - return x0; - } - else if constexpr (Nout == 1) - { - // TODO: optimize - constexpr size_t offset = (offsets + ... + 0); - return to_simd_array<T, Nin>(x0).val[offset >= Nin ? 0 : offset]; - } - else if constexpr (Nin < minwidth && Nout > Nin && - Offsets{}.equal( - cconcat(csizeseq<Nin>, csizeseq<(Nout > Nin ? Nout - Nin : 0), 0, 0>)) && - Operands{}.equal( - cconcat(csizeseq<Nin, 0, 0>, - csizeseq<(Nout > Nin ? Nout - Nin : 0), operand_undefined, 0>))) - { - // extending - static_assert(Nout == 0, "Required intrinsic is not defined"); - return {}; - } - else if constexpr (Nin < minwidth) - { - // x0 is smaller than min vector size, extend it (*) - auto tmp = - simd_shuf(simd_tag_v<T, Nin>, cconcat(csizeseq<Nin>, csizeseq<minwidth - Nin, 0, 0>), - cconcat(csizeseq<Nin, 0, 0>, csizeseq<minwidth - Nin, operand_undefined, 0>), x0); - // then do the shuffle on vector - return simd_shuf(simd_tag_v<T, minwidth>, Offsets{}, Operands{}, tmp); - } - else if constexpr (Nin > maxwidth && Nin > Nout && cmaxof(Offsets{}) < Nin / 2) - { - return simd_shuf(simd_tag_v<T, Nin / 2>, Offsets{}, Operands{}, x0.low); - } - else if constexpr (Nin > maxwidth && Nin > Nout && cminof(Offsets{}) >= Nin / 2) - { - return simd_shuf(simd_tag_v<T, Nin / 2>, csizes<(offsets - csize<Nin / 2>)...>, Operands{}, - x0.high); - } - else - { - NOT_OPTIMIZED; - return from_simd_array<T, Nout>(simd_array<T, Nout>{ simd_get_element<T, Nin>(x0, offsets)... }); - } + not_optimized(CMT_FUNC_SIGNATURE); + const simd_array<T, Nin> xx = to_simd_array<T, Nin>(x); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... 
}; + return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, Nin>(xx, indices_array)); } -#endif } } // namespace intrinsics diff --git a/include/kfr/simd/impl/basicoperators_clang.hpp b/include/kfr/simd/impl/basicoperators_clang.hpp @@ -44,7 +44,7 @@ KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x) template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>)> KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x) { - return simd_bitcast(simd_cvt_tag_v<T, utype<T>, N>, ~simd_bitcast(simd_cvt_tag_v<utype<T>, T, N>, x.v)); + return simd_bitcast(simd_cvt_t<T, utype<T>, N>{}, ~simd_bitcast(simd_cvt_t<utype<T>, T, N>{}, x.v)); } #define KFR_OP_SCALAR2(fn, op, resultprefix, operprefix, soperprefix) \ diff --git a/include/kfr/simd/impl/select.hpp b/include/kfr/simd/impl/select.hpp @@ -176,6 +176,7 @@ template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = v KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) { return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c))); + // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> @@ -187,7 +188,7 @@ KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const T& c) { - return concat(select(a.h.low, b, c), select(a.h.high, b, c)); + return concat2(select(a.h.low, b, c), select(a.h.high, b, c)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> @@ -199,7 +200,7 @@ KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, cons template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const T& c) { - return concat(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); + return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))> @@ -212,7 +213,7 @@ KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const T& b, const vec<T, N>& c) { - return concat(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); + return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high)); } #elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS @@ -276,7 +277,7 @@ KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, cons template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void> KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& a, const vec<T, N>& b, const vec<T, N>& c) { - return concat(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); + return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high)); } template <typename T, size_t N> KFR_INTRINSIC vec<T, N> select(const vec<bit<T>, N>& m, const T& x, const T& y) diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp @@ -34,30 +34,26 @@ inline namespace CMT_ARCH_NAME namespace intrinsics { -template <typename T, size_t... 
Ns> -struct fallback_simd_tag +template <typename T, size_t N> +struct simd_t { using value_type = T; - constexpr static size_t sizes[sizeof...(Ns)]{ Ns... }; + constexpr static size_t size() { return N; } }; -template <typename T, size_t... Ns> -struct simd_tag : fallback_simd_tag<T, Ns...> +template <typename T, size_t N1, size_t N2> +struct simd2_t { -}; + using value_type = T; -template <typename T, size_t... Ns> -constexpr inline const simd_tag<T, Ns...> simd_tag_v{}; + constexpr static size_t size1() { return N1; } -template <typename T> -struct simd_tag<T> -{ - using value_type = T; + constexpr static size_t size2() { return N2; } }; template <typename Tout, typename Tin, size_t N> -struct simd_cvt_tag +struct simd_cvt_t { using value_type_out = Tout; using value_type_in = Tin; @@ -65,9 +61,6 @@ struct simd_cvt_tag constexpr static size_t size() { return N; } }; -template <typename Tout, typename Tin, size_t N> -constexpr inline const simd_cvt_tag<Tout, Tin, N> simd_cvt_tag_v{}; - template <typename T, size_t N> constexpr size_t alignment() { diff --git a/include/kfr/simd/shuffle.hpp b/include/kfr/simd/shuffle.hpp @@ -68,8 +68,28 @@ KFR_INTRINSIC vec_shape<T, Nout> high(vec_shape<T, N>) template <typename T, size_t... Ns> KFR_INTRINSIC vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) CMT_NOEXCEPT { - return vec<T, csum<size_t, Ns...>()>(intrinsics::simd_concat( - intrinsics::simd_tag_v<typename vec<T, 1>::scalar_type, vec<T, Ns>::scalar_size()...>, vs.v...)); + return vec<T, csum<size_t, Ns...>()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, Ns>::scalar_size()...>(vs.v...)); +} + +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC vec<T, N1 + N2> concat2(const vec<T, N1>& x, const vec<T, N2>& y) CMT_NOEXCEPT +{ + return vec<T, csum<size_t, N1, N2>()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N1>::scalar_size(), + vec<T, N2>::scalar_size()>(x.v, y.v)); +} + +template <typename T, size_t N> +KFR_INTRINSIC vec<T, N * 4> concat4(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, + const vec<T, N>& d) CMT_NOEXCEPT +{ + return intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N * 2>::scalar_size(), + vec<T, N * 2>::scalar_size()>( + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(), + vec<T, N>::scalar_size()>(a.v, b.v), + intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(), + vec<T, N>::scalar_size()>(c.v, d.v)); } template <size_t count, typename T, size_t N, size_t Nout = N* count> diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp @@ -282,7 +282,7 @@ struct alignas(internal::vec_alignment<T, N_>) vec template <typename U, KFR_ENABLE_IF(std::is_convertible_v<U, value_type>&& compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT - : v(intrinsics::simd_broadcast(intrinsics::simd_tag_v<unwrap_bit<ST>, SN>, + : v(intrinsics::simd_broadcast(intrinsics::simd_t<unwrap_bit<ST>, SN>{}, static_cast<unwrap_bit<ST>>(static_cast<ST>(s)))) { } @@ -290,8 +290,9 @@ struct alignas(internal::vec_alignment<T, N_>) vec template <typename U, KFR_ENABLE_IF(std::is_convertible_v<U, value_type> && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT - : v(intrinsics::simd_shuffle(intrinsics::simd_tag_v<unwrap_bit<ST>, SW>, csizeseq<SN> % csize<SW>, - internal::compoundcast<T>::to_flat(static_cast<T>(s)).v)) + : v(intrinsics::simd_shuffle(intrinsics::simd_t<unwrap_bit<ST>, SW>{}, + 
internal::compoundcast<T>::to_flat(static_cast<T>(s)).v, + csizeseq<SN> % csize<SW>, overload_auto)) { } @@ -304,8 +305,7 @@ struct alignas(internal::vec_alignment<T, N_>) vec template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT - : v(intrinsics::simd_concat( - intrinsics::simd_tag_v<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>, + : v(intrinsics::simd_concat<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>( internal::compoundcast<T>::to_flat(s0).v, internal::compoundcast<T>::to_flat(s1).v, internal::compoundcast<T>::to_flat(static_cast<T>(rest)).v...)) { @@ -316,7 +316,7 @@ struct alignas(internal::vec_alignment<T, N_>) vec (compound_type_traits<T>::is_scalar && !is_bit<U>))> KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT : v(intrinsics::simd_convert( - intrinsics::simd_cvt_tag_v<unwrap_bit<ST>, unwrap_bit<deep_subtype<U>>, SN>, x.v)) + intrinsics::simd_cvt_t<unwrap_bit<ST>, unwrap_bit<deep_subtype<U>>, SN>{}, x.v)) { } @@ -343,7 +343,7 @@ struct alignas(internal::vec_alignment<T, N_>) vec // from list of vectors template <size_t... Ns, typename = std::enable_if_t<csum<size_t, Ns...>() == N>> KFR_MEM_INTRINSIC vec(const vec<T, Ns>&... vs) CMT_NOEXCEPT - : v(intrinsics::simd_concat(intrinsics::simd_tag_v<ST, (SW * Ns)...>, vs.v...)) + : v(intrinsics::simd_concat<ST, (SW * Ns)...>(vs.v...)) { } @@ -362,15 +362,15 @@ struct alignas(internal::vec_alignment<T, N_>) vec KFR_MEM_INTRINSIC static vec frombits(const vec<U, M>& v) CMT_NOEXCEPT { return intrinsics::simd_bitcast( - intrinsics::simd_cvt_tag_v<ST, typename vec<U, M>::scalar_type, vec<U, M>::scalar_size()>, v.v); + intrinsics::simd_cvt_t<ST, typename vec<U, M>::scalar_type, vec<U, M>::scalar_size()>{}, v.v); } // shuffle template <size_t... indices> KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...> i) const CMT_NOEXCEPT { - return vec<value_type, sizeof...(indices)>( - intrinsics::simd_shuffle(intrinsics::simd_tag_v<unwrap_bit<ST>, SN>, scale<SW>(i), v)); + return vec<value_type, sizeof...(indices)>(intrinsics::simd_shuffle( + intrinsics::simd_t<unwrap_bit<ST>, SN>{}, v, scale<SW>(i), overload_auto)); } template <size_t... indices> @@ -378,7 +378,7 @@ struct alignas(internal::vec_alignment<T, N_>) vec csizes_t<indices...> i) const CMT_NOEXCEPT { return vec<value_type, sizeof...(indices)>( - intrinsics::simd_shuffle(intrinsics::simd_tag_v<ST, SN, SN>, scale<SW>(i), v, y.v)); + intrinsics::simd_shuffle(intrinsics::simd2_t<ST, SN, SN>{}, v, y.v, scale<SW>(i), overload_auto)); } // element access @@ -431,7 +431,7 @@ struct alignas(internal::vec_alignment<T, N_>) vec KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT { return internal::compoundcast<T>::from_flat(intrinsics::simd_shuffle( - intrinsics::simd_tag_v<unwrap_bit<ST>, SN>, csizeseq<SW, SW * index>, v)); + intrinsics::simd_t<unwrap_bit<ST>, SN>{}, v, csizeseq<SW, SW * index>, overload_auto)); } template <size_t index> @@ -585,14 +585,14 @@ template <typename T, size_t N, size_t... indices> KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevector(const vec<T, N>& x, csizes_t<indices...> i) CMT_NOEXCEPT { - return intrinsics::simd_shuffle(intrinsics::simd_tag_v<unwrap_bit<T>, N>, i, x.v); + return intrinsics::simd_shuffle(intrinsics::simd_t<unwrap_bit<T>, N>{}, x.v, i, overload_auto); } template <typename T, size_t N, size_t... 
indices> KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevectors(const vec<T, N>& x, const vec<T, N>& y, csizes_t<indices...> i) CMT_NOEXCEPT { - return intrinsics::simd_shuffle(intrinsics::simd_tag_v<T, N, N>, i, x.v, y.v); + return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, i, overload_auto); } namespace internal diff --git a/include/kfr/testo/comparison.hpp b/include/kfr/testo/comparison.hpp @@ -16,8 +16,6 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wexit-time-destructors") #endif CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpadded") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4018)) namespace testo { @@ -245,5 +243,4 @@ struct make_comparison }; } // namespace testo -CMT_PRAGMA_MSVC(warning(pop)) CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/tests/base_test.cpp b/tests/base_test.cpp @@ -54,11 +54,9 @@ TEST(test_basic) const vec<int, 2> two = concat(one, make_vector(42)); CHECK(two == vec<int, 2>{ 42, 42 }); -#if !defined CMT_COMPILER_IS_MSVC const vec<u8, 256> very_long_vector = repeat<64>(make_vector<u8>(1, 2, 4, 8)); CHECK(slice<0, 17>(very_long_vector) == vec<unsigned char, 17>{ 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1 }); -#endif // * ...really any: using big_vector = vec<i16, 107>; @@ -96,7 +94,6 @@ TEST(test_basic) TEST(ctti) { CHECK(cometa::type_name<float>() == std::string("float")); } - } // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -24,6 +24,74 @@ constexpr ctypes_t<float, double> dft_float_types{}; constexpr ctypes_t<float> dft_float_types{}; #endif +#if defined(CMT_ARCH_X86) + +static void full_barrier() +{ +#ifdef CMT_COMPILER_GNU + asm volatile("mfence" ::: "memory"); +#else + _ReadWriteBarrier(); +#endif +} +static CMT_NOINLINE void dont_optimize(const void* in) +{ +#ifdef CMT_COMPILER_GNU + asm volatile("" : "+m"(in)); +#else + volatile uint8_t a = *reinterpret_cast<const uint8_t*>(in); +#endif +} + +template <typename T> +static void perf_test_t(int size) +{ + print("[PERFORMANCE] DFT ", fmt<'s', 6>(type_name<T>()), " ", fmt<'d', 6>(size), "..."); + random_state gen1 = random_init(2247448713, 915890490, 864203735, 2982561); + random_state gen2 = random_init(2982561, 2247448713, 915890490, 864203735); + std::chrono::high_resolution_clock::duration duration(0); + dft_plan<T> dft(size); + univector<u8> tmp(dft.temp_size); + uint64_t counter = 0; + while (duration < std::chrono::seconds(1)) + { + univector<complex<T>> data(size); + data = make_complex(gen_random_range<T>(gen1, -1.0, +1.0), gen_random_range<T>(gen2, -1.0, +1.0)); + full_barrier(); + auto start = std::chrono::high_resolution_clock::now(); + dft.execute(data, data, tmp); + + full_barrier(); + duration += std::chrono::high_resolution_clock::now() - start; + dont_optimize(data.data()); + ++counter; + } + double opspersecond = counter / (std::chrono::nanoseconds(duration).count() / 1'000'000'000.0); + println(" ", fmt<'f', 12, 1>(opspersecond), " ops/second"); +} + +static void perf_test(int size) +{ + perf_test_t<float>(size); + perf_test_t<double>(size); +} + +TEST(test_performance) +{ + for (int size = 16; size <= 16384; size <<= 1) + { + perf_test(size); + } + +#ifndef KFR_DFT_NO_NPo2 + perf_test(210); + perf_test(3150); + perf_test(211); + perf_test(3163); +#endif +} +#endif + TEST(test_convolve) { univector<fbase, 5> a({ 1, 2, 3, 4, 5 }); @@ -201,94 +269,24 @@ TEST(dct) univector<u8> tmp(plan.temp_size); plan.execute(out, in, tmp, false); - 
univector<float, size> refout = { 120.f, -51.79283109806667f, 0.f, -5.6781471211595695f, - 0.f, -1.9843883778092053f, 0.f, -0.9603691873838152f, - 0.f, -0.5308329190495176f, 0.f, -0.3030379000702155f, - 0.f, -0.1584982220313824f, 0.f, -0.0494839805703826f }; + univector<float, size> refout = { 120., -51.79283109806667, 0., -5.6781471211595695, + 0., -1.9843883778092053, 0., -0.9603691873838152, + 0., -0.5308329190495176, 0., -0.3030379000702155, + 0., -0.1584982220313824, 0., -0.0494839805703826 }; CHECK(rms(refout - out) < 0.00001f); plan.execute(outinv, in, tmp, true); - univector<float, size> refoutinv = { 59.00747544192212f, -65.54341437693878f, 27.70332758523579f, - -24.56124678824279f, 15.546989102481612f, -14.293082621965974f, - 10.08224348063459f, -9.38097406470581f, 6.795411054455922f, - -6.320715753372687f, 4.455202292297903f, -4.0896421269390455f, - 2.580439536964837f, -2.2695816108369176f, 0.9311870090070382f, - -0.643618159997807f }; + univector<float, size> refoutinv = { 59.00747544192212, -65.54341437693878, 27.70332758523579, + -24.56124678824279, 15.546989102481612, -14.293082621965974, + 10.08224348063459, -9.38097406470581, 6.795411054455922, + -6.320715753372687, 4.455202292297903, -4.0896421269390455, + 2.580439536964837, -2.2695816108369176, 0.9311870090070382, + -0.643618159997807 }; CHECK(rms(refoutinv - outinv) < 0.00001f); } - - -#if defined(CMT_ARCH_X86) - -static void full_barrier() -{ -#ifdef CMT_COMPILER_GNU - asm volatile("mfence" ::: "memory"); -#else - _ReadWriteBarrier(); -#endif -} -static CMT_NOINLINE void dont_optimize(const void* in) -{ -#ifdef CMT_COMPILER_GNU - asm volatile("" : "+m"(in)); -#else - volatile uint8_t a = *reinterpret_cast<const uint8_t*>(in); -#endif -} - -template <typename T> -static void perf_test_t(int size) -{ - print("[PERFORMANCE] DFT ", fmt<'s', 6>(type_name<T>()), " ", fmt<'d', 6>(size), "..."); - random_state gen1 = random_init(2247448713, 915890490, 864203735, 2982561); - random_state gen2 = random_init(2982561, 2247448713, 915890490, 864203735); - std::chrono::high_resolution_clock::duration duration(0); - dft_plan<T> dft(size); - univector<u8> tmp(dft.temp_size); - uint64_t counter = 0; - while (duration < std::chrono::seconds(1)) - { - univector<complex<T>> data(size); - data = make_complex(gen_random_range<T>(gen1, -1.0, +1.0), gen_random_range<T>(gen2, -1.0, +1.0)); - full_barrier(); - auto start = std::chrono::high_resolution_clock::now(); - dft.execute(data, data, tmp); - - full_barrier(); - duration += std::chrono::high_resolution_clock::now() - start; - dont_optimize(data.data()); - ++counter; - } - double opspersecond = counter / (std::chrono::nanoseconds(duration).count() / 1'000'000'000.0); - println(" ", fmt<'f', 12, 1>(opspersecond), " ops/second"); -} - -static void perf_test(int size) -{ - perf_test_t<float>(size); - perf_test_t<double>(size); -} - -TEST(test_performance) -{ - for (int size = 16; size <= 16384; size <<= 1) - { - perf_test(size); - } - -#ifndef KFR_DFT_NO_NPo2 - perf_test(210); - perf_test(3150); - perf_test(211); - perf_test(3163); -#endif -} -#endif - } // namespace CMT_ARCH_NAME #ifndef KFR_NO_MAIN diff --git a/tests/unit/dsp/window.cpp b/tests/unit/dsp/window.cpp @@ -10,10 +10,6 @@ #include <kfr/dsp/window.hpp> #include <kfr/io/tostring.hpp> -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4305)) -CMT_PRAGMA_MSVC(warning(disable : 4244)) - namespace kfr { inline namespace CMT_ARCH_NAME @@ -191,5 +187,3 @@ TEST(window) } } // namespace CMT_ARCH_NAME } // namespace kfr - 
-CMT_PRAGMA_MSVC(warning(pop))
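
Note (illustration only, not part of this commit): the revert restores tag-struct dispatch in the SIMD backend, where an empty struct such as simd_t<T, N> or simd2_t<T, N1, N2> is constructed with {} and passed by value so the right intrinsic overload is chosen, in place of the simd_tag_v variable templates from the reverted refactoring. A minimal standalone sketch of that pattern follows; shuffle_impl is a hypothetical stand-in for the real intrinsics::simd_shuffle and is not KFR API.

    // tag_dispatch_sketch.cpp -- compile with any C++17 compiler
    #include <cstddef>
    #include <iostream>

    template <typename T, std::size_t N>
    struct simd_t // empty tag: carries element type and width at compile time
    {
        using value_type = T;
        constexpr static std::size_t size() { return N; }
    };

    template <typename T, std::size_t N1, std::size_t N2>
    struct simd2_t // tag for two-operand shuffles with independent widths
    {
        using value_type = T;
        constexpr static std::size_t size1() { return N1; }
        constexpr static std::size_t size2() { return N2; }
    };

    // Overload selection is driven entirely by the tag object passed first,
    // mirroring calls such as simd_shuffle(simd_t<unwrap_bit<ST>, SN>{}, ...).
    template <typename T, std::size_t N>
    void shuffle_impl(simd_t<T, N>)
    {
        std::cout << "one-vector shuffle, width " << N << '\n';
    }

    template <typename T, std::size_t N1, std::size_t N2>
    void shuffle_impl(simd2_t<T, N1, N2>)
    {
        std::cout << "two-vector shuffle, widths " << N1 << " and " << N2 << '\n';
    }

    int main()
    {
        shuffle_impl(simd_t<float, 4>{});     // picks the single-vector overload
        shuffle_impl(simd2_t<float, 4, 4>{}); // picks the two-vector overload
    }

The tag carries no data, so passing it by value costs nothing; it only steers overload resolution and template argument deduction, which is why the reverted variable-template form and this struct form are interchangeable in principle, with the struct form avoiding the MSVC internal compiler error mentioned in the commit message.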