kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit a5251827c16beec42b386b5e78d0663ae37639d8
parent 6feabe6fffbd44c16a643159e0dc87bcc8f2282f
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Fri, 10 Nov 2023 08:44:48 +0000

DFT performance improvements

Diffstat:
M CMakeLists.txt                            |    6 +++---
M include/kfr/dft/fft.hpp                   |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M include/kfr/dft/impl/bitrev.hpp           |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M include/kfr/dft/impl/dft-fft.hpp          |   15 ++++++++++-----
M include/kfr/dft/impl/dft-impl.hpp         |    6 +++++-
M include/kfr/dft/impl/fft-impl.hpp         | 1110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M include/kfr/dft/impl/ft.hpp               |   85 +++++++++++++++++++++++++++++++++++------------------------------------
M include/kfr/simd/digitreverse.hpp         |   36 ++++++++++++++++++++++--------------
M include/kfr/simd/impl/specializations.hpp |   63 +++++++++++++++++++++++++++++++--------------------------------
M tests/dft_test.cpp                        |   22 +++++++++++++++-------
10 files changed, 1250 insertions(+), 312 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -163,8 +163,8 @@ if (APPLE)
 endif ()
 if (NOT IOS)
     if (NOT MSVC OR CLANG)
-        target_compile_options(kfr
-            INTERFACE "${CLANG_ARG_PREFIX}-mstackrealign")
+        # target_compile_options(kfr
+        #     INTERFACE "${CLANG_ARG_PREFIX}-mstackrealign")
     endif ()
 endif ()
 if (MSVC)
@@ -221,7 +221,7 @@ endfunction ()
 
 if (KFR_ENABLE_DFT)
-    set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast")
+    set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast -Xclang -O3 -mllvm -x86-use-vzeroupper=0")
 
     if (KFR_ENABLE_DFT_MULTIARCH)
         add_library(kfr_dft INTERFACE)
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -34,6 +34,7 @@
 #include "../simd/constants.hpp"
 #include "../simd/read_write.hpp"
 #include "../simd/vec.hpp"
+#include <bitset>
 
 CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wshadow")
@@ -49,6 +50,8 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
 namespace kfr
 {
 
+#define DFT_MAX_STAGES 32
+
 using cdirect_t = cfalse_t;
 using cinvert_t = ctrue_t;
 
@@ -67,17 +70,14 @@ struct dft_stage
     const char* name  = nullptr;
     bool recursion    = false;
     bool can_inplace  = true;
-    bool inplace      = false;
-    bool to_scratch   = false;
     bool need_reorder = true;
 
     void initialize(size_t size) { do_initialize(size); }
 
     virtual void dump() const
     {
-        printf("%s: %zu, %zu, %zu, %zu, %zu, %zu, %zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
-               stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
-               to_scratch);
+        printf("%s: %zu, %zu, %zu, %zu, %zu, %zu, %zu, %d, %d\n", name ? name : "unnamed", radix, stage_size,
+               data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace);
     }
 
     KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
@@ -173,7 +173,7 @@ struct dft_plan
 
     void dump() const
     {
-        for (const std::unique_ptr<dft_stage<T>>& s : stages)
+        for (const std::unique_ptr<dft_stage<T>>& s : all_stages)
         {
             s->dump();
         }
@@ -213,8 +213,89 @@ struct dft_plan
     autofree<u8> data;
     size_t data_size;
-    std::vector<dft_stage_ptr<T>> stages;
+
+    std::vector<dft_stage_ptr<T>> all_stages;
+    std::array<std::vector<dft_stage<T>*>, 2> stages;
     bool arblen;
+    using bitset = std::bitset<DFT_MAX_STAGES>;
+    std::array<bitset, 2> disposition_inplace;
+    std::array<bitset, 2> disposition_outofplace;
+
+    void calc_disposition()
+    {
+        for (bool inverse : { false, true })
+        {
+            auto&& stages = this->stages[inverse];
+            bitset can_inplace_per_stage;
+            for (int i = 0; i < stages.size(); ++i)
+            {
+                can_inplace_per_stage[i] = stages[i]->can_inplace;
+            }
+
+            disposition_inplace[static_cast<int>(inverse)] =
+                precompute_disposition(stages.size(), can_inplace_per_stage, true);
+            disposition_outofplace[static_cast<int>(inverse)] =
+                precompute_disposition(stages.size(), can_inplace_per_stage, false);
+        }
+    }
+
+    static bitset precompute_disposition(int num_stages, bitset can_inplace_per_stage,
+                                         bool inplace_requested)
+    {
+        static bitset even{ 0x5555555555555555ull };
+        bitset mask = ~bitset() >> (DFT_MAX_STAGES - num_stages);
+        bitset result;
+        // disposition indicates where is input for corresponding stage
+        // first bit : 0 - input,  1 - scratch
+        // other bits: 0 - output, 1 - scratch
+
+        // build disposition that works always
+        if (num_stages % 2 == 0)
+        { // even
+            result = ~even & mask;
+        }
+        else
+        { // odd
+            result = even & mask;
+        }
+
+        int num_inplace = can_inplace_per_stage.count();
+
+#ifdef KFR_DFT_ELIMINATE_MEMCPY
+        if (num_inplace > 0 && inplace_requested)
+        {
+            if (result.test(0)) // input is in scratch
+            {
+                // num_inplace must be odd
+                if (num_inplace % 2 == 0)
+                    --num_inplace;
+            }
+            else
+            {
+                // num_inplace must be even
+                if (num_inplace % 2 != 0)
+                    --num_inplace;
+            }
+        }
+#endif
+        if (num_inplace > 0)
+        {
+            for (int i = num_stages - 1; i >= 0; --i)
+            {
+                if (can_inplace_per_stage.test(i))
+                {
+                    result ^= ~bitset() >> (DFT_MAX_STAGES - (i + 1));
+
+                    if (--num_inplace == 0)
+                        break;
+                }
+            }
+        }
+
+        if (!inplace_requested) // out-of-place first stage; IN->OUT
+            result.reset(0);
+
+        return result;
+    }
 
 protected:
     struct noinit
@@ -224,32 +305,35 @@ protected:
         : size(size), temp_size(0), data_size(0), arblen(false)
     {
     }
-    const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
-                                const complex<T>* scratch, bool in_scratch) const
+    const complex<T>* select_in(bitset disposition, size_t stage, const complex<T>* out,
+                                const complex<T>* in, const complex<T>* scratch) const
     {
-        if (stage == 0)
-            return in_scratch ? scratch : in;
-        return stages[stage - 1]->to_scratch ? scratch : out;
+        return disposition.test(stage) ? scratch : stage == 0 ? in : out;
     }
-    complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
+    complex<T>* select_out(bitset disposition, size_t stage, size_t total_stages, complex<T>* out,
+                           complex<T>* scratch) const
     {
-        return stages[stage]->to_scratch ? scratch : out;
+        return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out;
     }
 
     template <bool inverse>
     void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
     {
+        auto&& stages = this->stages[inverse];
         if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
        {
             return stages[0]->execute(cbool<inverse>, out, in, temp);
         }
-        size_t stack[32] = { 0 };
+        size_t stack[DFT_MAX_STAGES] = { 0 };
+
+        bitset disposition =
+            in == out ? this->disposition_inplace[inverse] : this->disposition_outofplace[inverse];
 
         complex<T>* scratch = ptr_cast<complex<T>>(
             temp + this->temp_size -
             align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
 
-        bool in_scratch = !stages[0]->can_inplace && in == out;
+        bool in_scratch = disposition.test(0);
         if (in_scratch)
         {
             builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
@@ -273,8 +357,8 @@ protected:
                 }
                 else
                 {
-                    complex<T>* rout      = select_out(rdepth, out, scratch);
-                    const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
+                    complex<T>* rout      = select_out(disposition, rdepth, stages.size(), out, scratch);
+                    const complex<T>* rin = select_in(disposition, rdepth, out, in, scratch);
                     stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
                     offset += stages[rdepth]->out_offset;
                     stack[rdepth]++;
@@ -291,8 +375,9 @@ protected:
             size_t offset = 0;
             while (offset < this->size)
             {
-                stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch) + offset,
-                                       select_in(depth, out, in, scratch, in_scratch) + offset, temp);
+                stages[depth]->execute(
+                    cbool<inverse>, select_out(disposition, depth, stages.size(), out, scratch) + offset,
+                    select_in(disposition, depth, out, in, scratch) + offset, temp);
                 offset += stages[depth]->stage_size;
             }
             depth++;
@@ -306,7 +391,6 @@ struct dft_plan_real : dft_plan<T>
 {
     size_t size;
     dft_pack_format fmt;
-    dft_stage_ptr<T> fmt_stage;
 
     explicit dft_plan_real(cpu_t cpu, size_t size, dft_pack_format fmt = dft_pack_format::CCs)
         : dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt)
@@ -360,13 +444,10 @@ struct dft_plan_real : dft_plan<T>
     KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, cdirect_t = {}) const
     {
         this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp);
-        fmt_stage->execute(cfalse, out, out, nullptr);
     }
     KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, cinvert_t = {}) const
     {
-        complex<T>* outdata = ptr_cast<complex<T>>(out);
-        fmt_stage->execute(ctrue, outdata, in, nullptr);
-        this->execute_dft(ctrue, outdata, outdata, temp);
+        this->execute_dft(ctrue, ptr_cast<complex<T>>(out), in, temp);
     }
 
     template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
@@ -374,15 +455,12 @@ struct dft_plan_real : dft_plan<T>
                                    univector<u8, Tag3>& temp, cdirect_t = {}) const
     {
         this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data());
-        fmt_stage->execute(cfalse, out.data(), out.data(), nullptr);
     }
 
     template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
     KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in,
                                    univector<u8, Tag3>& temp, cinvert_t = {}) const
     {
-        complex<T>* outdata = ptr_cast<complex<T>>(out.data());
-        fmt_stage->execute(ctrue, outdata, in.data(), nullptr);
-        this->execute_dft(ctrue, outdata, outdata, temp.data());
+        this->execute_dft(ctrue, ptr_cast<complex<T>>(out.data()), in.data(), temp.data());
     }
 
     // Deprecated. fmt must be passed to constructor instead
@@ -434,7 +512,7 @@ struct dct_plan : dft_plan<T>
         }
         else
         {
-            mirrored = make_complex(make_univector(in, size) * cos(t), make_univector(in, size) * -sin(t));
+            mirrored = make_complex(make_univector(in, size) * cos(t), make_univector(in, size) * -sin(t));
             mirrored[0] = mirrored[0] * T(0.5);
             dft_plan<T>::execute(mirrored_dft.data(), mirrored.data(), temp, cfalse);
             for (size_t i = 0; i < halfSize; i++)
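Note on the fft.hpp change above: the per-stage inplace/to_scratch flags are replaced by two precomputed "disposition" bitsets per direction. Bit i says where stage i reads from (bit 0: user input vs scratch; higher bits: user output vs scratch), and a simple parity rule guarantees the last stage always writes to the output buffer. The following is a minimal standalone sketch of that rule — invented names, not KFR's API:

    #include <bitset>
    #include <cstdio>

    constexpr int max_stages = 32;
    using stage_bits = std::bitset<max_stages>;

    // Alternate buffers every stage so that stage num_stages-1 writes `out`.
    // Mirrors the core of precompute_disposition (without the can_inplace
    // optimization that XORs a suffix of bits to drop the initial memcpy).
    stage_bits alternating_disposition(int num_stages, bool inplace)
    {
        const stage_bits even{ 0x5555555555555555ull }; // bits 0, 2, 4, ... set
        const stage_bits mask = ~stage_bits() >> (max_stages - num_stages);
        stage_bits d = (num_stages % 2 == 0) ? ~even & mask : even & mask;
        if (!inplace)
            d.reset(0); // out-of-place: stage 0 can always read the input directly
        return d;
    }

    int main()
    {
        stage_bits d = alternating_disposition(3, /*inplace=*/true);
        for (int i = 0; i < 3; ++i)
            std::printf("stage %d reads from %s\n", i,
                        d.test(i) ? "scratch" : (i == 0 ? "input" : "output"));
    }

Each stage's destination then falls out of the next stage's bit, exactly as select_out above derives it, so no per-execution bookkeeping is needed.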
diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp
@@ -58,7 +58,12 @@ CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x)
 CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
 {
     if (bits > bitrev_table_log2N)
-        return bitreverse<32>(x) >> (32 - bits);
+    {
+        if (bits <= 16)
+            return bitreverse<16>(x) >> (16 - bits);
+        else
+            return bitreverse<32>(x) >> (32 - bits);
+    }
 
     return data::bitrev_table[x] >> (bitrev_table_log2N - bits);
 }
@@ -66,7 +71,12 @@ CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
 CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
 {
     if (bits > bitrev_table_log2N)
-        return digitreverse4<32>(x) >> (32 - bits);
+    {
+        if (bits <= 16)
+            return digitreverse4<16>(x) >> (16 - bits);
+        else
+            return digitreverse4<32>(x) >> (32 - bits);
+    }
 
     x = data::bitrev_table[x];
     x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
@@ -218,7 +228,7 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>)
 }
 
 template <typename T>
-KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */)
 {
     constexpr size_t bitrev = 4;
     fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4);
@@ -232,6 +242,20 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
 }
 
 template <typename T>
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */)
+{
+    constexpr size_t bitrev = 2;
+    fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4);
+    fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4);
+}
+
+template <typename T>
 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
 {
     constexpr size_t bitrev = 2;
@@ -253,6 +277,44 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
     fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4);
 }
 
+template <typename T>
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */)
+{
+    constexpr size_t bitrev = 2;
+    fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 7 * 4, 56 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4);
+    fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4);
+    fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4);
+    fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4);
+}
+
 template <typename T, bool use_br2>
 KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
 {
@@ -285,22 +347,35 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
     {
         size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
+        else
+        {
+            i += 4 * istep;
+            continue;
+        }
 
         i += istep;
         j = j + jstep1;
 
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
         i += istep;
         j = j - jstep2;
 
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
         i += istep;
         j = j + jstep1;
 
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
         i += istep;
     }
 }
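The bitrev.hpp change above adds a cheaper fallback for indices that fit in 16 bits: reverse only the low 16 bits instead of all 32. A self-contained sketch of that shortcut (illustrative; KFR additionally consults a lookup table for indices up to bitrev_table_log2N bits):

    #include <cstdint>

    constexpr std::uint32_t bitreverse32(std::uint32_t x)
    {
        x = (x >> 16) | (x << 16);
        x = ((x & 0xff00ff00u) >> 8) | ((x & 0x00ff00ffu) << 8);
        x = ((x & 0xf0f0f0f0u) >> 4) | ((x & 0x0f0f0f0fu) << 4);
        x = ((x & 0xccccccccu) >> 2) | ((x & 0x33333333u) << 2);
        x = ((x & 0xaaaaaaaau) >> 1) | ((x & 0x55555555u) << 1);
        return x;
    }

    constexpr std::uint32_t bitreverse16(std::uint32_t x)
    {
        // Reverses the low 16 bits only; one fewer shuffle round.
        x = ((x & 0xff00u) >> 8) | ((x & 0x00ffu) << 8);
        x = ((x & 0xf0f0u) >> 4) | ((x & 0x0f0fu) << 4);
        x = ((x & 0xccccu) >> 2) | ((x & 0x3333u) << 2);
        x = ((x & 0xaaaau) >> 1) | ((x & 0x5555u) << 1);
        return x;
    }

    // Reverse the lowest `bits` bits of x, as in the patched fallback path.
    constexpr std::uint32_t bitrev(std::uint32_t x, unsigned bits)
    {
        return bits <= 16 ? bitreverse16(x) >> (16 - bits)
                          : bitreverse32(x) >> (32 - bits);
    }

    static_assert(bitrev(0b001u, 3) == 0b100u, "3-bit reversal");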
diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp
@@ -94,14 +94,19 @@ CMT_PRAGMA_GNU(GCC diagnostic push)
 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
 #endif
 
-template <typename Stage, typename T, typename... Args>
-void add_stage(dft_plan<T>* self, Args... args)
+template <typename Stage, bool add_stages = true, typename T, typename... Args>
+void add_stage(dft_plan<T>* plan, Args... args)
 {
     dft_stage<T>* stage = new Stage(args...);
     stage->need_reorder = true;
-    self->data_size += stage->data_size;
-    self->temp_size += stage->temp_size;
-    self->stages.push_back(dft_stage_ptr<T>(stage));
+    plan->data_size += stage->data_size;
+    plan->temp_size += stage->temp_size;
+    plan->all_stages.push_back(dft_stage_ptr<T>(stage));
+    if constexpr (add_stages)
+    {
+        plan->stages[0].push_back(stage);
+        plan->stages[1].push_back(stage);
+    }
 }
 
 } // namespace CMT_ARCH_NAME
diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp
@@ -515,9 +515,11 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
         }
     });
 
+    int num_stages = 0;
     if (cur_size >= 101)
     {
         add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size);
+        ++num_stages;
         self->arblen = true;
     }
     else
@@ -535,6 +537,7 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
                     prepare_dft_stage(self, r, iterations, blocks, ctrue);
                 else
                     prepare_dft_stage(self, r, iterations, blocks, cfalse);
+                ++num_stages;
                 blocks *= r;
             }
         }
@@ -547,9 +550,10 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
                 prepare_dft_stage(self, cur_size, iterations, blocks, ctrue);
             else
                 prepare_dft_stage(self, cur_size, iterations, blocks, cfalse);
+            ++num_stages;
         }
 
-        if (self->stages.size() > 2)
+        if (num_stages > 2)
             add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size);
     }
 }
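add_stage above now records each stage twice: all_stages owns every stage (for cleanup and dump()), while stages[0]/stages[1] are non-owning per-direction execution lists, which is what lets the plan schedule different stage sequences for forward and inverse transforms. A minimal sketch of that ownership split (hypothetical simplified types; KFR uses dft_stage_ptr<T> and raw dft_stage<T>* views):

    #include <array>
    #include <memory>
    #include <vector>

    struct stage
    {
        virtual ~stage() = default;
    };

    struct plan
    {
        std::vector<std::unique_ptr<stage>> all_stages; // owns lifetime
        std::array<std::vector<stage*>, 2> stages;      // [0] = direct, [1] = inverse

        template <typename Stage, bool add_to_both = true, typename... Args>
        void add_stage(Args&&... args)
        {
            auto s     = std::make_unique<Stage>(std::forward<Args>(args)...);
            stage* raw = s.get();
            all_stages.push_back(std::move(s));
            if constexpr (add_to_both)
            {
                stages[0].push_back(raw);
                stages[1].push_back(raw);
            }
        }
    };

The num_stages counter in init_dft exists because self->stages.size() no longer counts only the butterfly stages once the reorder stage is added to both direction lists.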
diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -46,6 +46,57 @@ namespace kfr
 inline namespace CMT_ARCH_NAME
 {
 
+constexpr bool inline always_br2   = true;
+constexpr bool inline use_autosort = false;
+
+#define KFR_AUTOSORT_FOR_128D
+#define KFR_AUTOSORT_FOR_256D
+#define KFR_AUTOSORT_FOR_512
+#define KFR_AUTOSORT_FOR_1024
+
+#ifdef CMT_ARCH_AVX
+template <>
+KFR_INTRINSIC vec<float, 32> ctranspose<4, float, 32>(const vec<float, 32>& v16)
+{
+    cvec<float, 4> r0, r1, r2, r3;
+    split(v16, r0, r1, r2, r3);
+    const __m256d t0 = _mm256_unpacklo_pd(r0.v, r1.v);
+    const __m256d t1 = _mm256_unpacklo_pd(r2.v, r3.v);
+    const __m256d t2 = _mm256_unpackhi_pd(r0.v, r1.v);
+    const __m256d t3 = _mm256_unpackhi_pd(r2.v, r3.v);
+    r0.v = _mm256_permute2f128_pd(t0, t1, 0x20);
+    r1.v = _mm256_permute2f128_pd(t2, t3, 0x20);
+    r2.v = _mm256_permute2f128_pd(t0, t1, 0x31);
+    r3.v = _mm256_permute2f128_pd(t2, t3, 0x31);
+    return concat(r0, r1, r2, r3);
+}
+#endif
+
+template <>
+KFR_INTRINSIC vec<float, 64> ctranspose<8, float, 64>(const vec<float, 64>& v32)
+{
+    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
+    split(v32, a0, a1, a2, a3, a4, a5, a6, a7);
+    cvec<float, 16> even = concat(a0, a2, a4, a6);
+    cvec<float, 16> odd  = concat(a1, a3, a5, a7);
+    even = ctranspose<4>(even);
+    odd  = ctranspose<4>(odd);
+    return concat(even, odd);
+}
+
+template <>
+KFR_INTRINSIC vec<float, 64> ctranspose<4, float, 64>(const vec<float, 64>& v32)
+{
+    cvec<float, 16> lo, hi;
+    split(v32, lo, hi);
+    lo = ctranspose<4>(lo);
+    hi = ctranspose<4>(hi);
+    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
+    split(lo, a0, a1, a2, a3);
+    split(hi, a4, a5, a6, a7);
+    return concat(a0, a4, a1, a5, a2, a6, a3, a7);
+}
+
 namespace intrinsics
 {
 
@@ -59,8 +110,13 @@ KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*spl
     ww = swap<2>(ww);
 
     if constexpr (inverse)
-        tw_ = -(tw_);
-    ww = subadd(b1, ww * dupodd(tw_));
+    {
+        ww = addsub(b1, ww * dupodd(tw_));
+    }
+    else
+    {
+        ww = subadd(b1, ww * dupodd(tw_));
+    }
     return ww;
 }
 
@@ -134,18 +190,18 @@ KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splito
 {
     const size_t N4 = N / 4;
     cvec<T, width> w1, w2, w3;
-    constexpr bool read_split  = !splitin && splitout;
-    constexpr bool write_split = splitin && !splitout;
+    constexpr bool read_split  = !splitin;
+    constexpr bool write_split = !splitout;
 
     vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3;
 
     split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0);
-    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
     split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2);
-    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
-
     const vec<T, width> sum02re = re0 + re2;
     const vec<T, width> sum02im = im0 + im2;
+
+    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
+    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
     const vec<T, width> sum13re = re1 + re3;
     const vec<T, width> sum13im = im1 + im3;
 
@@ -254,19 +310,38 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s
 #endif
 #endif
 
-template <typename T>
+template <size_t size = 1, typename T>
 KFR_INTRINSIC void prefetch_one(const complex<T>* in)
 {
     KFR_PREFETCH(in);
+    if constexpr (sizeof(complex<T>) * size > 64)
+        KFR_PREFETCH(in + 64);
+    if constexpr (sizeof(complex<T>) * size > 128)
+        KFR_PREFETCH(in + 128);
+    if constexpr (sizeof(complex<T>) * size > 192)
+        KFR_PREFETCH(in + 192);
 }
 
-template <typename T>
+template <size_t size = 1, typename T>
 KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in)
 {
-    KFR_PREFETCH(in);
-    KFR_PREFETCH(in + stride);
-    KFR_PREFETCH(in + stride * 2);
-    KFR_PREFETCH(in + stride * 3);
+    prefetch_one<size>(in);
+    prefetch_one<size>(in + stride);
+    prefetch_one<size>(in + stride * 2);
+    prefetch_one<size>(in + stride * 3);
+}
+
+template <size_t size = 1, typename T>
+KFR_INTRINSIC void prefetch_eight(size_t stride, const complex<T>* in)
+{
+    prefetch_one<size>(in);
+    prefetch_one<size>(in + stride);
+    prefetch_one<size>(in + stride * 2);
+    prefetch_one<size>(in + stride * 3);
+    prefetch_one<size>(in + stride * 4);
+    prefetch_one<size>(in + stride * 5);
+    prefetch_one<size>(in + stride * 6);
+    prefetch_one<size>(in + stride * 7);
 }
 
 template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
@@ -277,7 +352,7 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
                                    const complex<T>*& twiddle)
 {
     static_assert(width > 0, "width cannot be zero");
-    constexpr static size_t prefetch_offset = width * 8;
+    constexpr static size_t prefetch_cycles = 8;
     const auto N4  = N / csize_t<4>();
     const auto N43 = N4 * csize_t<3>();
     CMT_ASSUME(blocks > 0);
@@ -286,16 +361,17 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
     DFT_ASSERT(width <= N4);
     CMT_LOOP_NOUNROLL
     for (size_t b = 0; b < blocks; b++)
     {
-        CMT_PRAGMA_CLANG(clang loop unroll_count(2))
-        for (size_t n2 = 0; n2 < N4; n2 += width)
+        CMT_LOOP_NOUNROLL
+        for (size_t n2 = 0; n2 < N4;)
         {
             if constexpr (prefetch)
-                prefetch_four(N4, in + prefetch_offset);
+                prefetch_four<width>(N4, in + width * prefetch_cycles);
             radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
                         cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
                         in, twiddle + n2 * 3);
             in += width;
             out += width;
+            n2 += width;
        }
         in += N43;
         out += N43;
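The prefetch helpers above are now parameterized by how many complex elements one loop iteration consumes, and call sites express the prefetch distance in loop iterations (prefetch_cycles) rather than raw elements. The pattern, as a hedged standalone sketch (assuming KFR_PREFETCH wraps a compiler builtin such as GCC/Clang's __builtin_prefetch):

    #include <complex>
    #include <cstddef>

    template <std::size_t width>
    void process_stream(std::complex<float>* out, const std::complex<float>* in, std::size_t n)
    {
        constexpr std::size_t prefetch_cycles = 8; // stay this many iterations ahead
        for (std::size_t i = 0; i < n; i += width)
        {
            // Touch the cache lines the loop will need `prefetch_cycles`
            // iterations from now; one prefetch per 64-byte line.
            const char* p = reinterpret_cast<const char*>(in + i + prefetch_cycles * width);
            for (std::size_t b = 0; b < width * sizeof(std::complex<float>); b += 64)
                __builtin_prefetch(p + b);

            for (std::size_t k = 0; k < width; ++k) // placeholder for the butterfly work
                out[i + k] = in[i + k];
        }
    }

Tying the distance to iterations keeps the prefetch lead time roughly constant as the per-iteration width changes between specializations.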
@@ -304,41 +380,314 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
     return {};
 }
 
-template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>,
-                                  cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
-                                  complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, bool inverse, typename T>
+KFR_INTRINSIC void radix2_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
+                                        cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>,
+                                        cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+                                        const complex<T>*& twiddle)
 {
-    CMT_ASSUME(blocks > 0);
-    constexpr static size_t prefetch_offset = 32 * 4;
-    for (size_t b = 0; b < blocks; b++)
+    for (size_t n = 0; n < stride; n++)
+    {
+        const cvec<T, 1> a = cread<1>(in + n);
+        const cvec<T, 1> b = cread<1>(in + n + stride);
+        cwrite<1>(out + n, a + b);
+        cwrite<1>(out + n + stride, a - b);
+    }
+}
+
+template <size_t N, bool inverse, bool split_format, typename T>
+KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
+                                    cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+{
+    if constexpr (split_format)
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = a1 - a3;
+        vec<T, N> diff13re, diff13im, diff02re, diff02im;
+        split(diff02, diff02re, diff02im);
+        split(diff13, diff13re, diff13im);
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+
+        (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+        (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+    }
+    else
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = swap<2>(a1 - a3);
+        if constexpr (inverse)
+            diff13 = negodd(diff13);
+        else
+            diff13 = negeven(diff13);
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+        w1 = diff02 - diff13;
+        w3 = diff02 + diff13;
+    }
+}
+
+template <size_t N, bool inverse, bool split_format, typename T>
+KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
+                                    cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3,
+                                    cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3)
+{
+    if constexpr (split_format)
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = a1 - a3;
+        vec<T, N> diff13re, diff13im, diff02re, diff02im;
+        split(diff02, diff02re, diff02im);
+        split(diff13, diff13re, diff13im);
+
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+        (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+        (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+
+        w2 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w2, tw2);
+        w1 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w1, tw1);
+        w3 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w3, tw3);
+    }
+    else
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = swap<2>(a1 - a3);
+        if constexpr (inverse)
+            diff13 = negodd(diff13);
+        else
+            diff13 = negeven(diff13);
+
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+        w1 = diff02 - diff13;
+        w3 = diff02 + diff13;
+        w2 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w2, tw2);
+        w1 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w1, tw1);
+        w3 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w3, tw3);
+    }
+}
+
+template <size_t N, bool split_format, typename T>
+KFR_INTRINSIC cvec<T, N> read_twiddle(const complex<T>* tw)
+{
+    if constexpr (split_format)
+    {
+        return concat(repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw))),
+                      repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw) + 1)));
+    }
+    else
+    {
+        return repeat<N>(cread<1>(tw));
+    }
+}
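radix4_butterfly above is the vectorized form of the standard radix-4 DIT step: two sums, two differences, and a multiplication of one difference by ±i, which reduces to swapping real/imaginary parts and negating one component (the swap<2>/negodd/negeven sequence). A scalar reference version for comparison (illustration only; the diff's version processes `width` butterflies at once and has a split-format path):

    #include <complex>

    template <bool inverse, typename T>
    void radix4_butterfly_scalar(std::complex<T> a0, std::complex<T> a1,
                                 std::complex<T> a2, std::complex<T> a3,
                                 std::complex<T>& w0, std::complex<T>& w1,
                                 std::complex<T>& w2, std::complex<T>& w3)
    {
        const std::complex<T> sum02  = a0 + a2;
        const std::complex<T> diff02 = a0 - a2;
        const std::complex<T> sum13  = a1 + a3;
        const std::complex<T> d13    = a1 - a3;
        // Multiply (a1 - a3) by -i (forward) or +i (inverse): swap re/im and
        // negate one component; no full complex multiply is needed.
        const std::complex<T> diff13 = inverse
            ? std::complex<T>(-d13.imag(), d13.real())
            : std::complex<T>(d13.imag(), -d13.real());
        w0 = sum02 + sum13;
        w2 = sum02 - sum13;
        w1 = diff02 + diff13;
        w3 = diff02 - diff13;
    }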
+template <size_t width_, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass_first(size_t N, csize_t<width_>, cbool_t<splitout>,
+                                              cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                              complex<T>* out, const complex<T>* in,
+                                              const complex<T>*& twiddle)
+{
+    static_assert(width_ > 0, "width cannot be zero");
+    const size_t N4        = N / 4;
+    const size_t Nstride   = N4;
+    constexpr size_t width = width_;
+    constexpr bool split   = splitin || splitout;
+    static_assert(!split);
+    constexpr static size_t prefetch_cycles = 8;
+
+    // CMT_LOOP_NOUNROLL
+    for (size_t b = 0; b < N4; b += width)
     {
         if constexpr (prefetch)
-            prefetch_four(csize_t<64>(), out + prefetch_offset);
-        cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
-        split(cread_split<8, aligned, splitin>(out + 0), w0, w1);
-        split(cread_split<8, aligned, splitin>(out + 8), w2, w3);
-        split(cread_split<8, aligned, splitin>(out + 16), w4, w5);
-        split(cread_split<8, aligned, splitin>(out + 24), w6, w7);
-
-        butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
-
-        w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
-        w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
-        w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
-        w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
-        w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
-        w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
-        w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
-
-        cvec<T, 8> z0, z1, z2, z3;
-        transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
-
-        butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
-        cwrite<32, aligned>(out, bitreverse<2>(concat(concat(z0, z1), concat(z2, z3))));
-        out += 32;
+            prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+        cvec<T, width> tw1 = cread<width>(twiddle);
+        cvec<T, width> tw2 = cread<width>(twiddle + width);
+        cvec<T, width> tw3 = cread<width>(twiddle + 2 * width);
+
+        const cvec<T, width> a0 = cread<width>(in + 0 * Nstride);
+        const cvec<T, width> a1 = cread<width>(in + 1 * Nstride);
+        const cvec<T, width> a2 = cread<width>(in + 2 * Nstride);
+        const cvec<T, width> a3 = cread<width>(in + 3 * Nstride);
+        cvec<T, width> w0, w1, w2, w3;
+        radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
+        cvec<T, 4 * width> w0123 = concat(w0, w1, w2, w3);
+        w0123                    = ctranspose<width>(w0123);
+        cwrite<4 * width>(out, w0123);
+        twiddle += 3 * width;
+        in += width;
+        out += 4 * width;
+    }
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
+                                             cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                             complex<T>* out, const complex<T>* in,
+                                             const complex<T>*& twiddle)
+{
+    static_assert(width > 0, "width cannot be zero");
+    constexpr static size_t prefetch_cycles = 8;
+    constexpr bool split       = splitin || splitout;
+    constexpr bool read_split  = !splitin && split;
+    constexpr bool write_split = !splitout && split;
+    static_assert(!splitout);
+
+    CMT_PRAGMA_CLANG(clang loop unroll_count(4))
+    for (size_t n = 0; n < stride; n += width)
+    {
+        if constexpr (prefetch)
+            prefetch_four<width>(stride, in + prefetch_cycles * width);
+        const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
+        const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
+        const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
+        const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
+        cvec<T, width> w0, w1, w2, w3;
+        radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
+        cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+        cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+        cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+        cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+        in += width;
+        out += width;
     }
-    return {};
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix8_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
+                                             cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                             complex<T>* out, const complex<T>* in,
+                                             const complex<T>*& twiddle)
+{
+    static_assert(width > 0, "width cannot be zero");
+    constexpr static size_t prefetch_cycles = 4;
+    constexpr bool split       = splitin || splitout;
+    constexpr bool read_split  = !splitin && split;
+    constexpr bool write_split = !splitout && split;
+    static_assert(!splitout);
+
+    CMT_PRAGMA_CLANG(clang loop unroll_count(4))
+    for (size_t n = 0; n < stride; n += width)
+    {
+        if constexpr (prefetch)
+            prefetch_eight<width>(stride, in + prefetch_cycles * width);
+        const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
+        const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
+        const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
+        const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
+        const cvec<T, width> a4 = cread_split<width, false, read_split>(in + 4 * stride);
+        const cvec<T, width> a5 = cread_split<width, false, read_split>(in + 5 * stride);
+        const cvec<T, width> a6 = cread_split<width, false, read_split>(in + 6 * stride);
+        const cvec<T, width> a7 = cread_split<width, false, read_split>(in + 7 * stride);
+        cvec<T, width> w0, w1, w2, w3, w4, w5, w6, w7;
+        butterfly8<width, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, w0, w1, w2, w3, w4, w5, w6, w7);
+        cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+        cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+        cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+        cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+        cwrite_split<width, false, write_split>(out + 4 * stride, w4);
+        cwrite_split<width, false, write_split>(out + 5 * stride, w5);
+        cwrite_split<width, false, write_split>(out + 6 * stride, w6);
+        cwrite_split<width, false, write_split>(out + 7 * stride, w7);
+        in += width;
+        out += width;
+    }
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
+                                        cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                        complex<T>* out, const complex<T>* in, const complex<T>*& twiddle)
+{
+    static_assert(width > 0, "width cannot be zero");
+    constexpr static size_t prefetch_cycles = 8;
+    const size_t N4      = N / 4;
+    const size_t Nstride = stride * N4;
+    const size_t stride3 = 3 * stride;
+    constexpr bool split       = splitin || splitout;
+    constexpr bool read_split  = !splitin && split;
+    constexpr bool write_split = !splitout && split;
+
+    {
+        for (size_t n = 0; n < stride; n += width)
+        {
+            if constexpr (prefetch)
+                prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+            const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
+            const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
+            const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
+            const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
+            cvec<T, width> w0, w1, w2, w3;
+            radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
+            cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+            cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+            cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+            cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+            in += width;
+            out += width;
+        }
+        twiddle += 3;
+        out += stride3;
+    }
+
+    // CMT_LOOP_NOUNROLL
+    for (size_t b = 1; b < N4; b++)
+    {
+        cvec<T, width> tw1 = read_twiddle<width, split>(twiddle);
+        cvec<T, width> tw2 = read_twiddle<width, split>(twiddle + 1);
+        cvec<T, width> tw3 = read_twiddle<width, split>(twiddle + 2);
+        for (size_t n = 0; n < stride; n += width)
+        {
+            if constexpr (prefetch)
+                prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+            const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
+            const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
+            const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
+            const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
+            cvec<T, width> w0, w1, w2, w3;
+            radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
+            cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+            cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+            cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+            cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+            in += width;
+            out += width;
+        }
+        twiddle += 3;
+        out += stride3;
+    }
+}
+
+template <typename T>
+static void initialize_twiddle_autosort(size_t N, size_t w, complex<T>*& twiddle)
+{
+    for (size_t b = 0; b < N / 4; ++b)
+    {
+        cwrite<1>(twiddle + b / w * 3 * w + b % w + 0 * w, calculate_twiddle<T>(b, N));
+        cwrite<1>(twiddle + b / w * 3 * w + b % w + 1 * w, calculate_twiddle<T>(2 * b, N));
+        cwrite<1>(twiddle + b / w * 3 * w + b % w + 2 * w, calculate_twiddle<T>(3 * b, N));
+    }
+    twiddle += N / 4 * 3;
+}
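initialize_twiddle_autosort above stores, for each butterfly index b, the three factors w_N^b, w_N^{2b}, w_N^{3b} applied to outputs 1..3 of a radix-4 butterfly, grouped in blocks of w so a width-w pass can load them with contiguous vector reads. A scalar sketch of that layout (standalone; assumes the usual DFT convention w_N^k = exp(-2*pi*i*k/N), which is what calculate_twiddle is expected to produce):

    #include <cmath>
    #include <complex>
    #include <cstddef>
    #include <vector>

    template <typename T>
    std::vector<std::complex<T>> autosort_twiddles(std::size_t N, std::size_t w)
    {
        const T pi = std::acos(T(-1));
        auto tw = [N, pi](std::size_t k) {
            const T phi = T(-2) * pi * T(k % N) / T(N);
            return std::complex<T>(std::cos(phi), std::sin(phi));
        };
        std::vector<std::complex<T>> out(N / 4 * 3);
        for (std::size_t b = 0; b < N / 4; ++b)
        {
            // Factors for one radix-4 butterfly, grouped in blocks of w lanes.
            out[b / w * 3 * w + b % w + 0 * w] = tw(b);
            out[b / w * 3 * w + b % w + 1 * w] = tw(2 * b);
            out[b / w * 3 * w + b % w + 2 * w] = tw(3 * b);
        }
        return out;
    }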
+
+template <typename T>
+KFR_INTRINSIC void interleavehalves32(cvec<T, 32>& v0)
+{
+    cvec<T, 8> t0, t1, t2, t3;
+    split(v0, t0, t1, t2, t3);
+    t0 = interleavehalves(t0);
+    t1 = interleavehalves(t1);
+    t2 = interleavehalves(t2);
+    t3 = interleavehalves(t3);
+    v0 = concat(t0, t1, t2, t3);
 }
 
 template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
@@ -347,22 +696,47 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfa
                                   complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
 {
     CMT_ASSUME(blocks > 0);
-    DFT_ASSERT(2 <= blocks);
-    constexpr static size_t prefetch_offset = width * 16;
-    for (size_t b = 0; b < blocks; b += 2)
+    DFT_ASSERT(4 <= blocks);
+    constexpr static size_t prefetch_cycles = 8;
+    if constexpr (vector_capacity<T> >= 128)
     {
-        if constexpr (prefetch)
-            prefetch_one(out + prefetch_offset);
-
-        cvec<T, 8> vlo = cread<8, aligned>(out + 0);
-        cvec<T, 8> vhi = cread<8, aligned>(out + 8);
-        butterfly8<inverse>(vlo);
-        butterfly8<inverse>(vhi);
-        vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo);
-        vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi);
-        cwrite<8, aligned>(out, vlo);
-        cwrite<8, aligned>(out + 8, vhi);
-        out += 16;
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 4)
+        {
+            if constexpr (prefetch)
+                prefetch_one<32>(out + prefetch_cycles * 32);
+
+            cvec<T, 32> v32 = cread<32, aligned>(out);
+            cvec<T, 4> v0, v1, v2, v3, v4, v5, v6, v7;
+            v32 = ctranspose<8>(v32);
+            split(v32, v0, v1, v2, v3, v4, v5, v6, v7);
+            butterfly8<4, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
+            v32 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
+            v32 = ctranspose<4>(v32);
+            cwrite<32, aligned>(out, v32);
+
+            out += 32;
+        }
+    }
+    else
+    {
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 2)
+        {
+            if constexpr (prefetch)
+                prefetch_one<16>(out + prefetch_cycles * 16);
+
+            cvec<T, 16> v16 = cread<16, aligned>(out);
+            cvec<T, 2> v0, v1, v2, v3, v4, v5, v6, v7;
+            v16 = ctranspose<8>(v16);
+            split(v16, v0, v1, v2, v3, v4, v5, v6, v7);
+            butterfly8<2, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
+            v16 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
+            v16 = ctranspose<2>(v16);
+            cwrite<16, aligned>(out, v16);
+
+            out += 16;
+        }
     }
     return {};
 }
@@ -373,28 +747,67 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cf
                                   complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
 {
     CMT_ASSUME(blocks > 0);
-    constexpr static size_t prefetch_offset = width * 4;
-    DFT_ASSERT(2 <= blocks);
-    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
-    for (size_t b = 0; b < blocks; b += 2)
+    constexpr static size_t prefetch_cycles = 4;
+    DFT_ASSERT(4 <= blocks);
+    if constexpr (vector_capacity<T> >= 128)
     {
-        if constexpr (prefetch)
-            prefetch_one(out + prefetch_offset);
-
-        cvec<T, 16> vlo = cread<16, aligned>(out);
-        cvec<T, 16> vhi = cread<16, aligned>(out + 16);
-        butterfly4<4, inverse>(vlo);
-        butterfly4<4, inverse>(vhi);
-        apply_twiddles4<0, 4, 4, inverse>(vlo);
-        apply_twiddles4<0, 4, 4, inverse>(vhi);
-        vlo = digitreverse4<2>(vlo);
-        vhi = digitreverse4<2>(vhi);
-        butterfly4<4, inverse>(vlo);
-        butterfly4<4, inverse>(vhi);
-
-        use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
-        use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
-        out += 32;
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 4)
+        {
+            if constexpr (prefetch)
+                prefetch_one<64>(out + prefetch_cycles * 64);
+
+            cvec<T, 16> v0 = cread<16, aligned>(out);
+            cvec<T, 16> v1 = cread<16, aligned>(out + 16);
+            cvec<T, 16> v2 = cread<16, aligned>(out + 32);
+            cvec<T, 16> v3 = cread<16, aligned>(out + 48);
+            butterfly4_packed<4, inverse>(v0);
+            butterfly4_packed<4, inverse>(v1);
+            butterfly4_packed<4, inverse>(v2);
+            butterfly4_packed<4, inverse>(v3);
+            apply_twiddles4<0, 4, 4, inverse>(v0);
+            apply_twiddles4<0, 4, 4, inverse>(v1);
+            apply_twiddles4<0, 4, 4, inverse>(v2);
+            apply_twiddles4<0, 4, 4, inverse>(v3);
+            v0 = digitreverse4<2>(v0);
+            v1 = digitreverse4<2>(v1);
+            v2 = digitreverse4<2>(v2);
+            v3 = digitreverse4<2>(v3);
+            butterfly4_packed<4, inverse>(v0);
+            butterfly4_packed<4, inverse>(v1);
+            butterfly4_packed<4, inverse>(v2);
+            butterfly4_packed<4, inverse>(v3);
+
+            use_br2 ? cbitreverse_write(out, v0) : cdigitreverse4_write(out, v0);
+            use_br2 ? cbitreverse_write(out + 16, v1) : cdigitreverse4_write(out + 16, v1);
+            use_br2 ? cbitreverse_write(out + 32, v2) : cdigitreverse4_write(out + 32, v2);
+            use_br2 ? cbitreverse_write(out + 48, v3) : cdigitreverse4_write(out + 48, v3);
+            out += 64;
+        }
+    }
+    else
+    {
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 2)
+        {
+            if constexpr (prefetch)
+                prefetch_one<32>(out + prefetch_cycles * 32);
+
+            cvec<T, 16> vlo = cread<16, aligned>(out);
+            cvec<T, 16> vhi = cread<16, aligned>(out + 16);
+            butterfly4_packed<4, inverse>(vlo);
+            butterfly4_packed<4, inverse>(vhi);
+            apply_twiddles4<0, 4, 4, inverse>(vlo);
+            apply_twiddles4<0, 4, 4, inverse>(vhi);
+            vlo = digitreverse4<2>(vlo);
+            vhi = digitreverse4<2>(vhi);
+            butterfly4_packed<4, inverse>(vlo);
+            butterfly4_packed<4, inverse>(vhi);
+
+            use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
+            use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
+            out += 32;
+        }
     }
     return {};
 }
@@ -404,16 +817,18 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfa
                                   cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
                                   complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
 {
-    constexpr static size_t prefetch_offset = width * 4;
+    constexpr static size_t prefetch_cycles = 8;
     CMT_ASSUME(blocks > 8);
     DFT_ASSERT(8 <= blocks);
     for (size_t b = 0; b < blocks; b += 4)
     {
         if constexpr (prefetch)
-            prefetch_one(out + prefetch_offset);
+            prefetch_one<16>(out + prefetch_cycles * 16);
 
         cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out);
-        butterfly4<4, inverse>(v16);
+        butterfly4_packed<4, inverse>(v16);
+        if constexpr (use_br2)
+            v16 = permutegroups<(8), 0, 2, 1, 3>(v16);
         cdigitreverse4_write<aligned>(out, v16);
 
         out += 4 * 4;
@@ -447,6 +862,7 @@ struct fft_stage_impl : dft_stage<T>
     constexpr static bool prefetch = fft_config<T>::prefetch;
     constexpr static bool aligned  = false;
     constexpr static size_t width  = fft_config<T>::process_width;
+    constexpr static bool use_br2  = !is_even || always_br2;
 
     virtual void do_initialize(size_t size) override final
     {
@@ -464,7 +880,7 @@ struct fft_stage_impl : dft_stage<T>
         const size_t stg_size = this->stage_size;
         CMT_ASSUME(stg_size >= 2048);
         CMT_ASSUME(stg_size % 2048 == 0);
-        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(),
+        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
     }
 };
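The csize<8> and csize<16> passes above now branch at compile time on vector_capacity<T>: targets with more SIMD register space process four blocks per iteration instead of two. The dispatch pattern itself, as a hedged sketch (`simd_capacity` is an invented stand-in for KFR's vector_capacity<T>):

    #include <cstddef>

    template <typename T>
    inline constexpr std::size_t simd_capacity = 64; // e.g. 128 on wider targets

    template <typename T>
    void radix8_last_pass(T* out, std::size_t blocks)
    {
        if constexpr (simd_capacity<T> >= 128)
        {
            for (std::size_t b = 0; b < blocks; b += 4)
            { /* transpose + butterfly over 4 blocks (32 complex values) */ }
        }
        else
        {
            for (std::size_t b = 0; b < blocks; b += 2)
            { /* transpose + butterfly over 2 blocks (16 complex values) */ }
        }
    }

Because the branch is if constexpr, only the selected loop is instantiated, so the narrow path imposes no register pressure on wide targets and vice versa.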
@@ -485,12 +901,18 @@ struct fft_final_stage_impl : dft_stage<T>
     constexpr static size_t width = fft_config<T>::process_width;
     constexpr static bool is_even = cometa::is_even(ilog2(size));
-    constexpr static bool use_br2 = !is_even;
+    constexpr static bool use_br2 = !is_even || always_br2;
     constexpr static bool aligned = false;
     constexpr static bool prefetch = fft_config<T>::prefetch && splitin;
 
-    KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
-    KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
+    template <bool pass_splitin>
+    KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cbool_t<pass_splitin>, complex<T>*&)
+    {
+    }
+    template <bool pass_splitin>
+    KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cbool_t<pass_splitin>, complex<T>*&)
+    {
+    }
 
     static constexpr bool get_pass_splitout(size_t N) { return N / 4 > 8 && N / 4 / 4 >= width; }
 
@@ -518,33 +940,25 @@ struct fft_final_stage_impl : dft_stage<T>
         final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
     }
 
-    template <bool inverse, typename U = T, KFR_ENABLE_IF(std::is_same_v<U, float>)>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
-    {
-        radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
-                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
-    }
-
-    template <bool inverse, typename U = T, KFR_ENABLE_IF(std::is_same_v<U, float>)>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
+    template <bool inverse, bool pass_splitin, typename U = T, KFR_ENABLE_IF(vector_capacity<U> >= 128)>
+    KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+                                       const complex<T>*, const complex<T>*& twiddle)
     {
         radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
-    template <bool inverse>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
+    template <bool inverse, bool pass_splitin>
+    KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                        const complex<T>*, const complex<T>*& twiddle)
     {
         radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
-    template <bool inverse>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
+    template <bool inverse, bool pass_splitin>
+    KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+                                       const complex<T>*, const complex<T>*& twiddle)
     {
         radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
@@ -583,7 +997,77 @@ struct fft_reorder_stage_impl : dft_stage<T>
     template <bool inverse>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
     {
-        fft_reorder(out, this->user, cbool_t<!is_even>());
+        fft_reorder(out, this->user, cbool_t<(!is_even || always_br2)>());
     }
 };
 
+template <typename T, bool is_first, bool is_last, bool radix8>
+struct fft_autosort_stage_impl : dft_stage<T>
+{
+    fft_autosort_stage_impl(size_t stage_size, size_t stride)
+    {
+        this->name         = dft_name(this);
+        this->radix        = radix8 ? 8 : 4;
+        this->stage_size   = stage_size * stride * this->radix;
+        this->blocks       = stage_size;
+        this->recursion    = false;
+        this->can_inplace  = is_last;
+        this->need_reorder = false;
+        this->user         = stride;
+        if constexpr (!is_last)
+        {
+            this->data_size =
+                align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
+        }
+    }
+
+    constexpr static bool prefetch = fft_config<T>::prefetch;
+    constexpr static bool aligned  = false;
+
+    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+    void do_initialize(size_t total_size) final
+    {
+        if constexpr (!is_last)
+        {
+            complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+            if constexpr (is_first)
+                initialize_twiddle_autosort(this->blocks, width, twiddle);
+            else
+                initialize_twiddle_autosort(this->blocks, 1, twiddle);
+        }
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+    {
+        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+        const size_t stg_size     = this->blocks;
+        const size_t stride       = this->user;
+        if constexpr (is_first)
+        {
+            radix4_autosort_pass_first(stg_size, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+                                       cbool_t<inverse>(), out, in, twiddle);
+        }
+        else if constexpr (is_last)
+        {
+            if constexpr (radix8)
+                radix8_autosort_pass_last(stride, csize_t<width / 2>(), cfalse, cfalse, cbool_t<prefetch>(),
+                                          cbool_t<inverse>(), out, in, twiddle);
+            else
+                radix4_autosort_pass_last(stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+                                          cbool_t<inverse>(), out, in, twiddle);
+        }
+        else
+        {
+            if (stride == 4)
+                radix4_autosort_pass(stg_size, stride, csize_t<4>(), cfalse, cfalse, cbool_t<prefetch>(),
                                      cbool_t<inverse>(), out, in, twiddle);
+            else
+                radix4_autosort_pass(stg_size, stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                      cbool_t<inverse>(), out, in, twiddle);
+        }
+    }
+};
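fft_autosort_stage_impl above is the building block of the new "autosort" (Stockham-style) path: instead of running all butterflies in place and then fixing the order with a separate fft_reorder pass, each pass writes its results already partially reordered into the other buffer, so data ping-pongs between output and scratch and arrives in natural order. A compact, self-contained radix-2 reference version of the principle (a sketch, not KFR's implementation, which uses the radix-4/8 SIMD passes above):

    #include <cmath>
    #include <complex>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // Radix-2 Stockham autosort FFT; data.size() must be a power of two.
    void stockham_fft(std::vector<std::complex<double>>& data)
    {
        const std::size_t n = data.size();
        const double pi     = std::acos(-1.0);
        std::vector<std::complex<double>> scratch(n);
        std::complex<double>* x = data.data();
        std::complex<double>* y = scratch.data();
        for (std::size_t s = 1, m = n / 2; m >= 1; m /= 2, s *= 2)
        {
            for (std::size_t p = 0; p < m; ++p)
            {
                // Outputs land at indices 2p and 2p+1: this self-sorting
                // placement is what removes the bit-reversal pass.
                const std::complex<double> w = std::polar(1.0, -pi * double(p) / double(m));
                for (std::size_t q = 0; q < s; ++q)
                {
                    const std::complex<double> a = x[q + s * p];
                    const std::complex<double> c = x[q + s * (p + m)];
                    y[q + s * (2 * p + 0)] = a + c;
                    y[q + s * (2 * p + 1)] = (a - c) * w;
                }
            }
            std::swap(x, y); // ping-pong between the two buffers
        }
        if (x != data.data())
            data.assign(x, x + n); // odd pass count: result ended up in scratch
    }

The cost of the approach is the extra buffer traffic, which is why the fixed-size specializations below carefully schedule which pass reads and writes which buffer so the final pass always lands in `out`.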
@@ -593,7 +1077,11 @@ struct fft_specialization;
 template <typename T>
 struct fft_specialization<T, 0> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 1;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -608,7 +1096,11 @@ struct fft_specialization<T, 0> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 1> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 2;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -625,7 +1117,11 @@ struct fft_specialization<T, 1> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 2> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 4;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -642,7 +1138,11 @@ struct fft_specialization<T, 2> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 3> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 8;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -650,7 +1150,7 @@ struct fft_specialization<T, 3> : dft_stage<T>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         cvec<T, 8> v8 = cread<8, aligned>(in);
-        butterfly8<inverse>(v8);
+        butterfly8_packed<inverse>(v8);
         cwrite<8, aligned>(out, v8);
     }
 };
@@ -658,7 +1158,11 @@ struct fft_specialization<T, 3> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 4> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 16;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -666,7 +1170,7 @@ struct fft_specialization<T, 4> : dft_stage<T>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         cvec<T, 16> v16 = cread<16, aligned>(in);
-        butterfly16<inverse>(v16);
+        butterfly16_packed<inverse>(v16);
         cwrite<16, aligned>(out, v16);
     }
 };
@@ -674,7 +1178,11 @@ struct fft_specialization<T, 4> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 5> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 32;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -682,25 +1190,100 @@ struct fft_specialization<T, 5> : dft_stage<T>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         cvec<T, 32> v32 = cread<32, aligned>(in);
-        butterfly32<inverse>(v32);
+        butterfly32_packed<inverse>(v32);
         cwrite<32, aligned>(out, v32);
     }
 };
 
+#ifdef KFR_AUTOSORT_FOR_64
+template <typename T>
+struct fft_specialization<T, 6> : dft_stage<T>
+{
+    fft_specialization(size_t stage_size)
+    {
+        this->stage_size = 64;
+        this->name       = dft_name(this);
+        this->temp_size  = 64 * sizeof(complex<T>);
+        this->data_size  = 64 * sizeof(complex<T>);
+    }
+
+    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+    void do_initialize(size_t) final
+    {
+        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+        initialize_twiddle_autosort(64, width, twiddle);
+        initialize_twiddle_autosort(16, 1, twiddle);
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+    {
+        auto no              = cfalse;
+        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+        complex<T>* scratch  = ptr_cast<complex<T>>(temp);
+        radix4_autosort_pass_first(64, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+        radix4_autosort_pass(16, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+        radix4_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+    }
+};
+#else
 template <typename T>
 struct fft_specialization<T, 6> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 64;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
     template <bool inverse>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
-        butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
+        butterfly64_memory(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
     }
 };
+#endif
 
+#ifdef KFR_AUTOSORT_FOR_128D
+template <>
+struct fft_specialization<double, 7> : dft_stage<double>
+{
+    using T = double;
+    fft_specialization(size_t stage_size)
+    {
+        this->stage_size = 128;
+        this->name       = dft_name(this);
+        this->temp_size  = 128 * sizeof(complex<T>);
+        this->data_size  = 128 * sizeof(complex<T>);
+    }
+
+    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+    void do_initialize(size_t) final
+    {
+        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+        initialize_twiddle_autosort(128, width, twiddle);
+        initialize_twiddle_autosort(32, 1, twiddle);
+        initialize_twiddle_autosort(8, 1, twiddle);
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+    {
+        auto no              = cfalse;
+        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+        complex<T>* scratch  = ptr_cast<complex<T>>(temp);
+        radix4_autosort_pass_first(128, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+        radix4_autosort_pass(32, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+        radix8_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+    }
+};
+#else
 template <>
 struct fft_specialization<double, 7> : dft_stage<double>
 {
@@ -741,6 +1324,7 @@ struct fft_specialization<double, 7> : dft_stage<double>
         fft_reorder(out, csize_t<7>());
     }
 };
+#endif
 
 template <>
 struct fft_specialization<float, 7> : dft_stage<float>
@@ -775,7 +1359,9 @@ struct fft_specialization<float, 7> : dft_stage<float>
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
         radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
                     cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
-        radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+        radix4_pass(32, 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
         if (this->need_reorder)
             fft_reorder(out, csize_t<7>());
@@ -787,8 +1373,9 @@ struct fft_specialization<float, 8> : dft_stage<float>
 {
     fft_specialization(size_t)
     {
-        this->name      = dft_name(this);
-        this->temp_size = sizeof(complex<float>) * 256;
+        this->stage_size = 256;
+        this->name       = dft_name(this);
+        this->temp_size  = sizeof(complex<float>) * 256;
     }
 
     using T = float;
@@ -824,13 +1411,62 @@ struct fft_specialization<float, 8> : dft_stage<float>
     }
 };
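A note on the storage budget of the fixed-size autosort plans (the 128-point double plan above, and the 256/512/1024 plans that follow): each specialization reserves N * sizeof(complex<T>) for twiddles, which suffices because a pass over sub-size M consumes M/4*3 twiddles and the sub-sizes shrink geometrically, keeping the total under N. A quick hedged check of that accounting, mirroring the initialize_twiddle_autosort calls:

    #include <cstddef>
    #include <initializer_list>

    constexpr std::size_t twiddle_count(std::initializer_list<std::size_t> pass_sizes)
    {
        std::size_t total = 0;
        for (std::size_t m : pass_sizes) // one initialize_twiddle_autosort per pass
            total += m / 4 * 3;
        return total;
    }

    static_assert(twiddle_count({ 128, 32, 8 }) == 126, "fits in 128");
    static_assert(twiddle_count({ 256, 64, 16 }) == 252, "fits in 256");
    static_assert(twiddle_count({ 512, 128, 32, 8 }) == 510, "fits in 512");
    static_assert(twiddle_count({ 1024, 256, 64, 16 }) == 1020, "fits in 1024");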
scratch, tw); + radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw); + radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw); + } + } +}; +#else template <> struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> { using T = double; fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) { - this->name = dft_name(this); + this->stage_size = 256; + this->name = dft_name(this); } DFT_STAGE_FN @@ -839,16 +1475,56 @@ struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> { fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr); if (this->need_reorder) - fft_reorder(out, csize_t<8>()); + fft_reorder(out, csize_t<8>(), cbool<always_br2>); } }; +#endif + +#ifdef KFR_AUTOSORT_FOR_512 + +template <typename T> +struct fft_specialization<T, 9> : dft_stage<T> +{ + fft_specialization(size_t stage_size) + { + this->stage_size = 512; + this->name = dft_name(this); + this->temp_size = 512 * sizeof(complex<T>); + this->data_size = 512 * sizeof(complex<T>); + } + constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width)); + + void do_initialize(size_t) final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddle_autosort(512, width, twiddle); + initialize_twiddle_autosort(128, 1, twiddle); + initialize_twiddle_autosort(32, 1, twiddle); + initialize_twiddle_autosort(8, 1, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + auto no = cfalse; + const complex<T>* tw = ptr_cast<complex<T>>(this->data); + complex<T>* scratch = ptr_cast<complex<T>>(temp); + radix4_autosort_pass_first(512, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw); + radix4_autosort_pass(128, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass(32, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw); + radix8_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw); + } +}; +#else template <typename T> struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> { fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) { - this->name = dft_name(this); + this->stage_size = 512; + this->name = dft_name(this); } DFT_STAGE_FN @@ -860,7 +1536,47 @@ struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> fft_reorder(out, csize_t<9>()); } }; +#endif +#ifdef KFR_AUTOSORT_FOR_1024 +template <typename T> +struct fft_specialization<T, 10> : dft_stage<T> +{ + fft_specialization(size_t stage_size) + { + this->stage_size = 1024; + this->name = dft_name(this); + this->temp_size = 1024 * sizeof(complex<T>); + this->data_size = 1024 * sizeof(complex<T>); + } + + constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width)); + + void do_initialize(size_t) final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddle_autosort(1024, width, twiddle); + initialize_twiddle_autosort(256, 1, twiddle); + initialize_twiddle_autosort(64, 1, twiddle); + initialize_twiddle_autosort(16, 1, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + auto no = cfalse; + auto split = cfalse; + const complex<T>* tw = ptr_cast<complex<T>>(this->data); + complex<T>* 
scratch = ptr_cast<complex<T>>(temp); + radix4_autosort_pass_first(1024, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw); + radix4_autosort_pass(256, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass(64, 16, csize<width>, split, no, no, cbool<inverse>, scratch, out, tw); + radix4_autosort_pass(16, 64, csize<width>, split, split, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass_last(256, csize<width>, no, split, no, cbool<inverse>, out, out, tw); + } +}; +#else template <typename T> struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> { @@ -878,23 +1594,86 @@ struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> fft_reorder(out, 10, cfalse); } }; +#endif + +#ifdef KFR_AUTOSORT_FOR_2048 +template <typename T> +struct fft_specialization<T, 11> : dft_stage<T> +{ + fft_specialization(size_t stage_size) + { + this->stage_size = 2048; + this->name = dft_name(this); + this->temp_size = 2048 * sizeof(complex<T>); + this->data_size = 2048 * sizeof(complex<T>); + } + + constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width)); + + void do_initialize(size_t) final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddle_autosort(2048, width, twiddle); + initialize_twiddle_autosort(512, 1, twiddle); + initialize_twiddle_autosort(128, 1, twiddle); + initialize_twiddle_autosort(32, 1, twiddle); + initialize_twiddle_autosort(8, 1, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + auto no = cfalse; + const complex<T>* tw = ptr_cast<complex<T>>(this->data); + complex<T>* scratch = ptr_cast<complex<T>>(temp); + radix4_autosort_pass_first(2048, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw); + radix4_autosort_pass(512, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass(128, 16, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw); + radix4_autosort_pass(32, 64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw); + radix8_autosort_pass_last(256, csize<width>, no, no, no, cbool<inverse>, out, out, tw); + } +}; +#endif } // namespace intrinsics template <bool is_even, bool first, typename T> void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>, cbool_t<first>) { - constexpr size_t final_size = is_even ? 1024 : 512; - - if (stage_size >= 2048) + if constexpr (use_autosort) { - add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size); - - make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); + if (stage_size >= 16) + { + add_stage<intrinsics::fft_autosort_stage_impl<T, first, false, false>>(self, stage_size, + self->size / stage_size); + make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); + } + else + { + if (stage_size == 8) + add_stage<intrinsics::fft_autosort_stage_impl<T, false, true, true>>(self, stage_size, + self->size / stage_size); + else + add_stage<intrinsics::fft_autosort_stage_impl<T, false, true, false>>( + self, stage_size, self->size / stage_size); + } } else { - add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size); + constexpr size_t final_size = is_even ? 
1024 : 512; + + if (stage_size >= 2048) + { + add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size); + + make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); + } + else + { + add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size); + add_stage<intrinsics::fft_reorder_stage_impl<T, is_even>>(self, self->size); + } } } @@ -935,7 +1714,7 @@ KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self) { self->data = autofree<u8>(self->data_size); size_t offset = 0; - for (dft_stage_ptr<T>& stage : self->stages) + for (dft_stage_ptr<T>& stage : self->all_stages) { initialize_data_stage(self, stage, offset); } @@ -945,21 +1724,10 @@ KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self) template <typename T> KFR_INTRINSIC void initialize_order(dft_plan<T>* self) { - bool to_scratch = false; - bool scratch_needed = false; - for (dft_stage_ptr<T>& stage : reversed(self->stages)) - { - if (to_scratch) - { - scratch_needed = true; - } - stage->to_scratch = to_scratch; - if (!stage->can_inplace) - { - to_scratch = !to_scratch; - } - } - if (scratch_needed || !self->stages[0]->can_inplace) + self->calc_disposition(); + typename dft_plan<T>::bitset ored = self->disposition_inplace[0] | self->disposition_inplace[1] | + self->disposition_outofplace[0] | self->disposition_outofplace[1]; + if (ored.any()) // if scratch needed self->temp_size += align_up(sizeof(complex<T>) * self->size, platform<>::native_cache_alignment); } @@ -975,15 +1743,8 @@ KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order) constexpr size_t log2nv = val_of(decltype(log2n)()); add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size); }, - [&]() - { - cswitch(cfalse_true, is_even(log2n), - [&](auto is_even) - { - make_fft(self, size, is_even, ctrue); - constexpr size_t is_evenv = val_of(decltype(is_even)()); - add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(self, size); - }); + [&]() { + cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { make_fft(self, size, is_even, ctrue); }); }); } @@ -1099,12 +1860,12 @@ void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const com const cvec<T, width> fpk = cread<width>(in + i); const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); - const cvec<T, width> f1k = fpk + fpnk; - const cvec<T, width> f2k = fpk - fpnk; - const cvec<T, width> t = cmul_conj(f2k, tw); - cwrite<width>(out + i, f1k + t); - cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); - }); + const cvec<T, width> f1k = fpk + fpnk; + const cvec<T, width> f2k = fpk - fpnk; + const cvec<T, width> t = cmul_conj(f2k, tw); + cwrite<width>(out + i, f1k + t); + cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); + }); if (is_even(csize)) { size_t k = csize / 2; @@ -1151,10 +1912,11 @@ struct dft_stage_real_repack : dft_stage<T> public: dft_stage_real_repack(size_t real_size, dft_pack_format fmt) { - this->user = static_cast<int>(fmt); - this->stage_size = real_size; + this->user = static_cast<int>(fmt); + this->stage_size = real_size; + this->can_inplace = true; const size_t count = (real_size / 2 + 1) / 2; - this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment); + this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment); } void do_initialize(size_t) override { @@ -1162,15 +1924,15 @@ public: constexpr size_t width = vector_width<T> * 2; size_t real_size = this->stage_size; complex<T>* 
rtwiddle = ptr_cast<complex<T>>(this->data); - const size_t count = (real_size / 2 + 1) / 2; + const size_t count = (real_size / 2 + 1) / 2; block_process(count, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); cwrite<width>( rtwiddle + i, - cossin(dup(-constants<T>::pi * - ((enumerate<T, width>() + i + real_size / T(4)) / (real_size / 2))))); + cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + real_size / T(4)) / + (real_size / 2))))); }); } void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override @@ -1191,10 +1953,10 @@ void dft_real_initialize(dft_plan_real<T>& plan) if (plan.size == 0) return; initialize_stages(&plan); - plan.fmt_stage.reset(new dft_stage_real_repack<T>(plan.size, plan.fmt)); - plan.data_size += plan.fmt_stage->data_size; - size_t offset = initialize_data(&plan); - initialize_data_stage(&plan, plan.fmt_stage, offset); + add_stage<dft_stage_real_repack<T>, false>(&plan, plan.size, plan.fmt); + plan.stages[0].push_back(plan.all_stages.back().get()); + plan.stages[1].insert(plan.stages[1].begin(), plan.all_stages.back().get()); + initialize_data(&plan); initialize_order(&plan); } diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp @@ -37,7 +37,6 @@ #include "../../base/memory.hpp" #include "../data/sincos.hpp" - CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wpass-failed") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") @@ -181,7 +180,7 @@ template <size_t N, bool A = false, bool split = false, typename T> KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) { cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); - if (split) + if constexpr (split) temp = splitpairs(temp); return temp; } @@ -190,7 +189,7 @@ template <size_t N, bool A = false, bool split = false, typename T> KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) { cvec<T, N> v = value; - if (split) + if constexpr (split) v = interleavehalves(v); v.write(ptr_cast<T>(dest), cbool_t<A>()); } @@ -278,12 +277,14 @@ KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices template <size_t N, size_t stride, typename T> KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) { - if (stride == 1) + if constexpr (stride == 1) { return ref_cast<cvec<T, N>>(*base); } else + { return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); + } } KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) @@ -342,7 +343,7 @@ KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, cs template <size_t N, size_t stride, typename T> KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) { - if (stride == 1) + if constexpr (stride == 1) { cwrite<N>(base, value); } @@ -510,35 +511,35 @@ template <size_t k, size_t size, bool inverse = false, typename T, size_t width, KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) { constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); - if (kk == 0) + if constexpr (kk == 0) { return x; } - else if (kk == size * 1 / 8) + else if constexpr (kk == size * 1 / 8) { return swap<2>(subadd(swap<2>(x), x)) * isqrt2; } - else if (kk == size * 2 / 8) + else if constexpr (kk == size * 2 / 8) { return negodd(swap<2>(x)); } - else if (kk == size * 3 / 8) + else if constexpr (kk == size * 3 / 8) { return subadd(x, swap<2>(x)) * -isqrt2; } - else if (kk == size * 4 / 8) + else if constexpr (kk == size * 4 / 8) { return -x; } - else if 
(kk == size * 5 / 8) + else if constexpr (kk == size * 5 / 8) { return swap<2>(subadd(swap<2>(x), x)) * -isqrt2; } - else if (kk == size * 6 / 8) + else if constexpr (kk == size * 6 / 8) { return swap<2>(negodd(x)); } - else if (kk == size * 7 / 8) + else if constexpr (kk == size * 7 / 8) { return subadd(x, swap<2>(x)) * isqrt2; } @@ -582,7 +583,7 @@ KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, c diff13 = high(diff0213); w0 = sum02 + sum13; w2 = sum02 - sum13; - if (inverse) + if constexpr (inverse) { diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); diff13 = swap<2>(diff13); @@ -679,7 +680,7 @@ KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, } template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8) +KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8) { cvec<T, 2> w0, w1, w2, w3; split<T, 16>(v8, w0, w1, w2, w3); @@ -688,7 +689,7 @@ KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8) } template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32) +KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32) { cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; split(v32, w0, w1, w2, w3, w4, w5, w6, w7); @@ -710,7 +711,7 @@ KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32) } template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123) +KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123) { cvec<T, N> a0; cvec<T, N> a1; @@ -722,7 +723,7 @@ KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123) } template <size_t N, typename T> -KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01) +KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01) { cvec<T, N> a0; cvec<T, N> a1; @@ -734,14 +735,14 @@ KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01) template <size_t N, bool inverse = false, bool split_format = false, typename T> KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) { - if (split_format) + if constexpr (split_format) { vec<T, N> re1, im1, tw1re, tw1im; split<T, 2 * N>(a1, re1, im1); split<T, 2 * N>(tw1, tw1re, tw1im); vec<T, N> b1re = re1 * tw1re; vec<T, N> b1im = im1 * tw1re; - if (inverse) + if constexpr (inverse) w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im); else w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im); @@ -752,7 +753,7 @@ KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cv const cvec<T, N> a1_ = swap<2>(a1); cvec<T, N> tw1_ = tw1; - if (inverse) + if constexpr (inverse) tw1_ = -(tw1_); w1 = subadd(b1, a1_ * dupodd(tw1_)); } @@ -839,37 +840,38 @@ KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) } template <bool inverse, bool aligned, typename T> -KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, + const complex<T>* in) { cvec<T, 16> w0, w1, w2, w3; w0 = cread_group<4, 4, 16, aligned>( in); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48)); - butterfly4<4, inverse>(w0); + butterfly4_packed<4, inverse>(w0); apply_twiddles4<0, 1, 4, inverse>(w0); w1 = cread_group<4, 4, 16, aligned>( in + 4); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52)); - butterfly4<4, inverse>(w1); + butterfly4_packed<4, inverse>(w1); apply_twiddles4<4, 1, 4, inverse>(w1); w2 = cread_group<4, 4, 16, aligned>( in + 8); // 
concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56)); - butterfly4<4, inverse>(w2); + butterfly4_packed<4, inverse>(w2); apply_twiddles4<8, 1, 4, inverse>(w2); w3 = cread_group<4, 4, 16, aligned>( in + 12); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60)); - butterfly4<4, inverse>(w3); + butterfly4_packed<4, inverse>(w3); apply_twiddles4<12, 1, 4, inverse>(w3); transpose4(w0, w1, w2, w3); // pass 2: - butterfly4<4, inverse>(w0); - butterfly4<4, inverse>(w1); - butterfly4<4, inverse>(w2); - butterfly4<4, inverse>(w3); + butterfly4_packed<4, inverse>(w0); + butterfly4_packed<4, inverse>(w1); + butterfly4_packed<4, inverse>(w2); + butterfly4_packed<4, inverse>(w3); transpose4(w0, w1, w2, w3); @@ -881,26 +883,26 @@ KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* o apply_vertical_twiddles4<4, inverse>(w1, w2, w3); // pass 3: - butterfly4<4, inverse>(w3); + butterfly4_packed<4, inverse>(w3); cwrite_group<4, 4, 16, aligned>(out + 12, w3); // split(w3, out[3], out[7], out[11], out[15]); - butterfly4<4, inverse>(w2); + butterfly4_packed<4, inverse>(w2); cwrite_group<4, 4, 16, aligned>(out + 8, w2); // split(w2, out[2], out[6], out[10], out[14]); - butterfly4<4, inverse>(w1); + butterfly4_packed<4, inverse>(w1); cwrite_group<4, 4, 16, aligned>(out + 4, w1); // split(w1, out[1], out[5], out[9], out[13]); - butterfly4<4, inverse>(w0); + butterfly4_packed<4, inverse>(w0); cwrite_group<4, 4, 16, aligned>(out, w0); // split(w0, out[0], out[4], out[8], out[12]); } template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly16(cvec<T, 16>& v16) +KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16) { - butterfly4<4, inverse>(v16); + butterfly4_packed<4, inverse>(v16); apply_twiddles4<0, 4, 4, inverse>(v16); v16 = digitreverse4<2>(v16); - butterfly4<4, inverse>(v16); + butterfly4_packed<4, inverse>(v16); } template <size_t index, bool inverse = false, typename T> @@ -1505,7 +1507,7 @@ template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr)); - if (transposed) + if constexpr (transposed) temp = ctranspose<sizeof...(N)>(temp); split(temp, w...); } @@ -1533,7 +1535,7 @@ template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... 
args) { auto temp = concat(args...); - if (transposed) + if constexpr (transposed) temp = ctransposeinverse<sizeof...(N)>(temp); write(ptr_cast<T>(ptr), temp); } @@ -1634,7 +1636,7 @@ KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t const cvec<T, 1> ina = cread<1>(in + (1 + j)); const cvec<T, 1> inb = cread<1>(in + radix - (j + 1)); cvec<T, width> tw = cread<width>(twiddle); - if (inverse) + if constexpr (inverse) tw = negodd /*cconj*/ (tw); cmul_2conj(sum0, sum1, ina, inb, tw); @@ -1642,9 +1644,6 @@ KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t } twiddle = twiddle - halfradix_sqr + width; - // if (inverse) - // std::swap(sum0, sum1); - if (is_constant_val(ostride)) { cwrite<width>(out + (1 + i), sum0); diff --git a/include/kfr/simd/digitreverse.hpp b/include/kfr/simd/digitreverse.hpp @@ -39,29 +39,37 @@ CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } +constexpr KFR_INTRINSIC u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } template <u32 m, u32 shift, u32... values> -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) +constexpr KFR_INTRINSIC u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) { return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>()); } template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>) +constexpr KFR_INTRINSIC u32 digitreverse_impl(u32 x, csize_t<2>) { - return bit_permute_step_impl( - x, - cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); + constexpr cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16> + steps{}; + if constexpr (bits > 16) + return bit_permute_step_impl(x, steps) >> (32 - bits); + else if constexpr (bits > 8) + return bit_permute_step_impl(x, steps[csizeseq<8>]) >> (16 - bits); + else + return bit_permute_step_impl(x, steps[csizeseq<6>]) >> (8 - bits); } template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>) +constexpr KFR_INTRINSIC u32 digitreverse_impl(u32 x, csize_t<4>) { - return bit_permute_step_impl( - x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); + constexpr cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16> steps{}; + if constexpr (bits > 16) + return bit_permute_step_impl(x, steps) >> (32 - bits); + else if constexpr (bits > 8) + return bit_permute_step_impl(x, steps[csizeseq<6>]) >> (16 - bits); + else + return bit_permute_step_impl(x, steps[csizeseq<4>]) >> (8 - bits); } CMT_PRAGMA_GNU(GCC diagnostic pop) @@ -69,7 +77,7 @@ CMT_PRAGMA_GNU(GCC diagnostic pop) template <size_t radix, size_t bits> struct shuffle_index_digitreverse { - constexpr inline size_t operator()(size_t index) const CMT_NOEXCEPT + constexpr KFR_INTRINSIC size_t operator()(size_t index) const CMT_NOEXCEPT { return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>()); } @@ -96,13 +104,13 @@ KFR_INTRINSIC vec<T, N> digitreverse4(const vec<T, N>& x) } template <size_t bits> -constexpr inline u32 bitreverse(u32 x) +constexpr KFR_INTRINSIC u32 bitreverse(u32 x) { return internal::digitreverse_impl<bits>(x, csize_t<2>()); } template <size_t bits> -constexpr inline u32 digitreverse4(u32 
x) +constexpr KFR_INTRINSIC u32 digitreverse4(u32 x) { return internal::digitreverse_impl<bits>(x, csize_t<4>()); } diff --git a/include/kfr/simd/impl/specializations.hpp b/include/kfr/simd/impl/specializations.hpp @@ -22,22 +22,20 @@ #include "../shuffle.hpp" #endif -#ifdef KFR_COMPILER_GNU +#ifdef CMT_COMPILER_GNU namespace kfr { inline namespace CMT_ARCH_NAME { -namespace intrinsics -{ template <> -inline vec<f32, 32> shufflevector<f32, 32>( +template <> +inline vec<f32, 32> vec<f32, 32>::shuffle( csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, - 15, 22, 23, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) + 15, 22, 23, 30, 31>) const CMT_NOEXCEPT { - f32x32 w = x; + f32x32 w = *this; w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)), permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w))); @@ -47,12 +45,12 @@ inline vec<f32, 32> shufflevector<f32, 32>( } template <> -inline vec<f32, 32> shufflevector<f32, 32>( +template <> +inline vec<f32, 32> vec<f32, 32>::shuffle( csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22, - 23, 14, 15, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) + 23, 14, 15, 30, 31>) const CMT_NOEXCEPT { - f32x32 w = x; + f32x32 w = *this; w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)), permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w))); @@ -63,54 +61,55 @@ inline vec<f32, 32> shufflevector<f32, 32>( inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x) { - return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, - 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>, - x, x); + return x.shuffle(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, + 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>); } template <> -inline vec<f32, 64> shufflevector<f32, 64>( +template <> +inline vec<f32, 64> vec<f32, 64>::shuffle( csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13, 44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38, - 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>, - const vec<f32, 64>& x, const vec<f32, 64>&) + 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>) const CMT_NOEXCEPT { return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>( - concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x)))); + concat(bitreverse_2(even<8>(*this)), bitreverse_2(odd<8>(*this)))); } template <> -inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>, - const vec<f32, 16>& x, const vec<f32, 16>&) +template <> +inline vec<f32, 16> vec<f32, 16>::shuffle( + csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>) const CMT_NOEXCEPT { - // asm volatile("int $3"); - const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x); + const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(*this); - return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)), - shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx))); + return concat(low(xx).shuffle(high(xx), csizes<0, 2, 8 + 0, 8 + 2, 4, 6, 8 + 4, 8 + 6>), + low(xx).shuffle(high(xx), csizes<1, 3, 8 + 1, 8 + 3, 5, 7, 8 + 5, 8 + 7>)); } template <> -inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>, - const vec<f32, 16>& x, const vec<f32, 
16>&) +template <> +inline vec<f32, 16> vec<f32, 16>::shuffle( + csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>) const CMT_NOEXCEPT { const vec<f32, 16> xx = - concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); + concat(low(*this).shuffle(high(*this), csizes<0, 8 + 0, 1, 8 + 1, 4, 8 + 4, 5, 8 + 5>), + low(*this).shuffle(high(*this), csizes<2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7>)); return permutegroups<(4), 0, 2, 1, 3>(xx); } template <> -inline vec<f32, 32> shufflevector<f32, 32>( +template <> +inline vec<f32, 32> vec<f32, 32>::shuffle( csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, - 29, 14, 30, 15, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) + 29, 14, 30, 15, 31>) const CMT_NOEXCEPT { - const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x); + const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(*this); return concat(interleavehalves(low(xx)), interleavehalves(high(xx))); } -} // namespace intrinsics + } // namespace CMT_ARCH_NAME } // namespace kfr #endif diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -18,6 +18,15 @@ using namespace kfr; namespace CMT_ARCH_NAME { +TEST(print_vector_capacity) +{ + println("vector_capacity<float> = ", vector_capacity<float>); + println("vector_capacity<double> = ", vector_capacity<double>); + + println("fft_config<float>::process_width = ", vector_capacity<float> / 16); + println("fft_config<double>::process_width = ", vector_capacity<double> / 16); +} + #ifdef CMT_NATIVE_F64 constexpr ctypes_t<float, double> dft_float_types{}; #else @@ -78,7 +87,7 @@ static void perf_test(int size) TEST(test_performance) { - for (int size = 16; size <= 16384; size <<= 1) + for (int size = 16; size <= 65536; size <<= 1) { perf_test(size); } @@ -160,7 +169,7 @@ constexpr size_t fft_stopsize = 12; constexpr size_t dft_stopsize = 101; #endif #else -constexpr size_t fft_stopsize = 20; +constexpr size_t fft_stopsize = 21; #ifndef KFR_DFT_NO_NPo2 constexpr size_t dft_stopsize = 257; #endif @@ -183,15 +192,14 @@ TEST(fft_real) TEST(fft_real_not_size_4N) { kfr::univector<double, 6> in = counter(); - auto out = realdft(in); - kfr::univector<kfr::complex<double>> expected { - 15.0, { -3, 5.19615242}, {-3, +1.73205081}, -3.0 }; + auto out = realdft(in); + kfr::univector<kfr::complex<double>> expected{ 15.0, { -3, 5.19615242 }, { -3, +1.73205081 }, -3.0 }; CHECK(rms(cabs(out - expected)) <= 0.00001f); kfr::univector<double, 6> rev = irealdft(out) / 6; CHECK(rms(rev - in) <= 0.00001f); - random_state gen = random_init(2247448713, 915890490, 864203735, 2982561); - constexpr size_t size = 66; + random_state gen = random_init(2247448713, 915890490, 864203735, 2982561); + constexpr size_t size = 66; kfr::univector<double, size> in2 = gen_random_range<double>(gen, -1.0, +1.0); kfr::univector<kfr::complex<double>, size / 2 + 1> out2 = realdft(in2); kfr::univector<double, size> rev2 = irealdft(out2) / size;
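Note on the autosort kernels: the radix4_autosort_pass_first / radix4_autosort_pass / radix{4,8}_autosort_pass_last chains in the specializations above follow the Stockham autosort formulation. Each pass writes its butterflies back already in natural order, ping-ponging between the destination and the scratch buffer passed in temp, which is why these stages carry no fft_reorder step. A minimal radix-2 sketch of the idea in plain std::complex (an illustration under that interpretation, not the SIMD kernels themselves):

#include <cmath>
#include <complex>
#include <cstddef>
#include <utility>
#include <vector>

// Minimal radix-2 Stockham autosort FFT (illustration only; the kernels
// above are radix-4/8 and work at SIMD width). Every pass emits its
// results in sorted order while ping-ponging between the main and
// scratch buffers, so no bit-reversal pass is needed at the end.
inline void stockham_fft(std::vector<std::complex<double>>& x)
{
    const double pi = 3.141592653589793238462643383279502884;
    const std::size_t n = x.size(); // assumed to be a power of two
    std::vector<std::complex<double>> scratch(n);
    auto* src = &x;
    auto* dst = &scratch;
    // l: half of the current sub-transform length, m: output stride
    for (std::size_t l = n / 2, m = 1; l >= 1; l /= 2, m *= 2)
    {
        for (std::size_t j = 0; j < l; ++j)
        {
            // twiddle w = e^(-i*pi*j/l)
            const std::complex<double> w = std::polar(1.0, -pi * double(j) / double(l));
            for (std::size_t k = 0; k < m; ++k)
            {
                const std::complex<double> a = (*src)[k + j * m];
                const std::complex<double> b = (*src)[k + j * m + l * m];
                (*dst)[k + 2 * j * m]     = a + b;
                (*dst)[k + 2 * j * m + m] = (a - b) * w;
            }
        }
        std::swap(src, dst); // ping-pong, as the out/scratch chains above do
    }
    if (src != &x)
        x = *src; // odd number of passes: result ended in the scratch buffer
}

The price is the extra buffer (each autosort specialization above reserves temp_size = N * sizeof(complex<T>)); the gain is sequential, reorder-free output.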
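On the planner: in the use_autosort branch of make_fft above, one radix-4 pass is peeled per recursion step while the sub-size is at least 16, and the remainder gets a single small final stage (the stage_size == 8 branch selects what appears to be a radix-8 final-stage template, judging from the differing cbool arguments). A toy trace of that schedule (hypothetical standalone snippet, not the planner itself):

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t stage_size = 4096; // e.g. a 2^12 plan
    while (stage_size >= 16)
    {
        std::printf("radix-4 autosort pass over %zu\n", stage_size);
        stage_size /= 4; // one radix-4 pass peeled per recursion step
    }
    // for power-of-two sizes the remainder here is 4 or 8; the
    // stage_size == 8 case above picks a different final-stage template
    std::printf("final autosort stage of size %zu\n", stage_size);
    return 0;
}

For 4096 this prints passes over 4096, 1024, 256, 64 and 16, then a final stage of size 4, mirroring the recursion in make_fft.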
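On cmul_by_twiddle: the if constexpr chain above handles twiddle angles that are multiples of pi/4 without a general complex multiply. For kk == size/8 the twiddle is w = e^(-i*pi/4) = (1 - i)/sqrt(2), so (a + b*i)*w = ((a + b) + (b - a)*i)/sqrt(2): one add/sub pair plus a scale, which is what swap<2>(subadd(swap<2>(x), x)) * isqrt2 evaluates on interleaved re/im lanes. A scalar sanity check of that identity (standalone snippet, not KFR code):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const double isqrt2 = 0.70710678118654752440084436210485; // 1/sqrt(2)
    const std::complex<double> x(3.0, 4.0); // arbitrary a + b*i
    // reference: full complex multiply by w = e^(-i*pi/4)
    const std::complex<double> ref = x * std::polar(1.0, -std::acos(-1.0) / 4);
    // special-cased form: ((a + b) + (b - a)*i) * 1/sqrt(2)
    const std::complex<double> fast((x.real() + x.imag()) * isqrt2,
                                    (x.imag() - x.real()) * isqrt2);
    assert(std::abs(ref - fast) < 1e-12);
    return 0;
}

The other branches are even cheaper: multiples of pi/2 reduce to lane swaps and sign flips (negodd, unary minus), with no arithmetic on magnitudes at all.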
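On digitreverse_impl: the reversal is composed from masked swap steps, and the new bits > 16 / bits > 8 branches exist because a short index does not need the full five-step 32-bit ladder; reversing within 8 or 16 bits and shifting by the correspondingly smaller amount drops the redundant steps. The classic three-step 8-bit reversal shows the technique (a sketch, not the KFR template machinery):

#include <cstdint>

// One masked swap step: exchange the bit groups selected by `mask`
// with the groups `shift` positions above them.
constexpr uint32_t bit_permute_step(uint32_t x, uint32_t mask, uint32_t shift)
{
    return ((x & mask) << shift) | ((x >> shift) & mask);
}

// Reverse the low 8 bits: swap adjacent bits, then pairs, then nibbles.
// A full 32-bit reversal would continue with (0x00ff00ff, 8), (0x0000ffff, 16).
constexpr uint32_t bitreverse8(uint32_t x)
{
    x = bit_permute_step(x, 0x55, 1); // 0b01010101: single bits
    x = bit_permute_step(x, 0x33, 2); // 0b00110011: bit pairs
    x = bit_permute_step(x, 0x0f, 4); // 0b00001111: nibbles
    return x;
}

static_assert(bitreverse8(0b00000001) == 0b10000000);
static_assert(bitreverse8(0b11001010) == 0b01010011);

Starting the ladder at the 0x33/2 step instead of 0x55/1 reverses base-4 digits rather than bits, which is the csize_t<4> flavour consumed by the radix-4 stages.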
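On the fft_real_not_size_4N expectations: the literal can be checked by hand. For the ramp x_n = n with N = 6, X_0 = sum n = 15, and for 0 < k the identity sum n*z^n = N/(z - 1) with z = e^(-2*pi*i*k/N) (using z^N = 1) gives -3 + 3*sqrt(3)*i, -3 + sqrt(3)*i and -3 for k = 1, 2, 3, i.e. exactly { 15.0, { -3, 5.19615242 }, { -3, 1.73205081 }, -3.0 }. The same numbers from a naive O(N^2) DFT (hypothetical check, not part of the test suite):

#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
    const double pi = std::acos(-1.0);
    const int N = 6;
    // naive DFT of x_n = n, keeping bins 0..N/2 as realdft does
    for (int k = 0; k <= N / 2; ++k)
    {
        std::complex<double> X = 0.0;
        for (int n = 0; n < N; ++n)
            X += double(n) * std::polar(1.0, -2.0 * pi * k * n / N);
        std::printf("X[%d] = (%.8f, %.8f)\n", k, X.real(), X.imag());
    }
    // prints (up to rounding): (15, 0), (-3, 5.19615242), (-3, 1.73205081), (-3, 0)
    return 0;
}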