kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 194011494c612013f9dc8d75e96bd7fbd97e9b75
parent a92f4352d611566e7bbf3590d0768a9f444d8c14
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 27 Nov 2018 20:04:10 +0000

Non-power of 2 DFT

Diffstat:
MREADME.md | 2+-
Minclude/kfr/dft/dft-src.cpp | 490+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Minclude/kfr/dft/fft.hpp | 16+++++++++++++---
Minclude/kfr/dft/ft.hpp | 330+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Minclude/kfr/testo/testo.hpp | 14++++++++++++--
Mtests/dft_test.cpp | 118+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
6 files changed, 723 insertions(+), 247 deletions(-)

diff --git a/README.md b/README.md @@ -15,7 +15,7 @@ KFR is a header-only library (except DFT) and has no external dependencies. * Full AVX-512 support * EBU R128 -* Non-power of two DFT (coming soon) +* Non-power of two DFT * TruePeak (coming soon) * Number of automatic tests has been increased * GPL version changed from 3 to 2+ diff --git a/include/kfr/dft/dft-src.cpp b/include/kfr/dft/dft-src.cpp @@ -45,7 +45,7 @@ CMT_PRAGMA_MSVC(warning(disable : 4100)) namespace kfr { -constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 10> dft_radices{}; +constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; #define DFT_ASSERT TESTO_ASSERT_INACTIVE @@ -58,22 +58,30 @@ using cinvert_t = ctrue_t; template <typename T> struct dft_stage { + size_t radix = 0; size_t stage_size = 0; size_t data_size = 0; size_t temp_size = 0; u8* data = nullptr; size_t repeats = 1; size_t out_offset = 0; - size_t width = 0; size_t blocks = 0; - const char* name; - bool recursion = false; - bool can_inplace = true; - bool inplace = false; - bool to_scratch = false; + const char* name = nullptr; + bool recursion = false; + bool can_inplace = true; + bool inplace = false; + bool to_scratch = false; + bool need_reorder = true; void initialize(size_t size) { do_initialize(size); } + virtual void dump() const + { + printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix, + stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, + to_scratch); + } + KFR_INTRIN void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) { do_execute(cdirect_t(), out, in, temp); @@ -472,7 +480,7 @@ template <typename T> static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) { complex<T>* twiddle = ptr_cast<complex<T>>(stage->data); - const size_t N = stage->repeats * stage->stage_size; + const size_t N = stage->repeats * stage->radix; const size_t Nord = stage->repeats; size_t i = 0; @@ -482,7 +490,7 @@ static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) for (; i < Nord / width * width; i += width) { CMT_LOOP_NOUNROLL - for (size_t j = 1; j < stage->stage_size; j++) + for (size_t j = 1; j < stage->radix; j++) { CMT_LOOP_NOUNROLL for (size_t k = 0; k < width; k++) @@ -502,16 +510,17 @@ struct dft_stage_fixed_impl : dft_stage<T> { dft_stage_fixed_impl(size_t radix_, size_t iterations, size_t blocks) { - this->stage_size = radix; - this->blocks = blocks; - this->repeats = iterations; - this->recursion = false; // true; + this->name = type_name<decltype(*this)>(); + this->radix = radix; + this->blocks = blocks; + this->repeats = iterations; + this->recursion = false; // true; this->data_size = align_up((this->repeats * (radix - 1)) * sizeof(complex<T>), platform<>::native_cache_alignment); } -protected: - constexpr static size_t width = fft_vector_width<T>; + constexpr static size_t width = + radix >= 7 ? fft_vector_width<T> / 2 : radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2; virtual void do_initialize(size_t size) override final { dft_stage_fixed_initialize(this, width); } DFT_STAGE_FN @@ -521,7 +530,7 @@ protected: const size_t Nord = this->repeats; const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - const size_t N = Nord * this->stage_size; + const size_t N = Nord * this->radix; CMT_LOOP_NOUNROLL for (size_t b = 0; b < this->blocks; b++) { @@ -537,13 +546,16 @@ struct dft_stage_fixed_final_impl : dft_stage<T> { dft_stage_fixed_final_impl(size_t radix_, size_t iterations, size_t blocks) { - this->stage_size = radix; + this->name = type_name<decltype(*this)>(); + this->radix = radix; this->blocks = blocks; this->repeats = iterations; - this->recursion = false; // true; + this->recursion = false; this->can_inplace = false; } - constexpr static size_t width = fft_vector_width<T>; + constexpr static size_t width = + radix >= 7 ? fft_vector_width<T> / 2 : radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2; + DFT_STAGE_FN template <bool inverse> KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) @@ -555,19 +567,163 @@ struct dft_stage_fixed_final_impl : dft_stage<T> } }; +template <typename E> +inline E& apply_conj(E& e, cfalse_t) +{ + return e; +} + +template <typename E> +inline auto apply_conj(E& e, ctrue_t) +{ + return cconj(e); +} + +/// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1] +template <typename E> +struct fft_inverse : expression_base<E> +{ + using value_type = value_type_of<E>; + + CMT_INLINE fft_inverse(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {} + + CMT_INLINE vec<value_type, 1> operator()(cinput_t input, size_t index, vec_t<value_type, 1>) const + { + return this->argument_first(input, index == 0 ? 0 : this->size() - index, vec_t<value_type, 1>()); + } + + template <size_t N> + CMT_INLINE vec<value_type, N> operator()(cinput_t input, size_t index, vec_t<value_type, N>) const + { + if (index == 0) + { + return concat( + this->argument_first(input, index, vec_t<value_type, 1>()), + reverse(this->argument_first(input, this->size() - (N - 1), vec_t<value_type, N - 1>()))); + } + return reverse(this->argument_first(input, this->size() - index - (N - 1), vec_t<value_type, N>())); + } +}; + +template <typename E> +inline auto apply_fft_inverse(E&& e) +{ + return fft_inverse<E>(std::forward<E>(e)); +} + +template <typename T> +struct dft_arblen_stage_impl : dft_stage<T> +{ + dft_arblen_stage_impl(size_t size) + : fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal), size(size) + { + this->name = type_name<decltype(*this)>(); + this->radix = size; + this->blocks = 1; + this->repeats = 1; + this->recursion = false; + this->can_inplace = false; + this->temp_size = plan.temp_size; + + chirp_ = render(cexp(sqr(linspace(T(1) - size, size - T(1), size * 2 - 1, true, true)) * + complex<T>(0, -1) * c_pi<T> / size)); + + ichirpp_ = render(truncate(padded(1 / slice(chirp_, 0, 2 * size - 1)), fftsize)); + + univector<u8> temp(plan.temp_size); + plan.execute(ichirpp_, ichirpp_, temp); + xp.resize(fftsize, 0); + xp_fft.resize(fftsize); + invN2 = T(1) / fftsize; + } + + DFT_STAGE_FN + template <bool inverse> + KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + const size_t n = this->size; + const size_t N2 = this->fftsize; + + auto&& chirp = apply_conj(chirp_, cbool<inverse>); + + xp.slice(0, n) = make_univector(in, n) * slice(chirp, n - 1); + + plan.execute(xp_fft.data(), xp.data(), temp); + + if (inverse) + xp_fft = xp_fft * cconj(apply_fft_inverse(ichirpp_)); + else + xp_fft = xp_fft * ichirpp_; + plan.execute(xp_fft.data(), xp_fft.data(), temp, ctrue); + + make_univector(out, n) = xp_fft.slice(n - 1) * slice(chirp, n - 1) * invN2; + } + + const size_t size; + const size_t fftsize; + T invN2; + dft_plan<T> plan; + univector<complex<T>> chirp_; + univector<complex<T>> ichirpp_; + univector<complex<T>> xp; + univector<complex<T>> xp_fft; +}; + +template <typename T, size_t radix1, size_t radix2, size_t size = radix1* radix2> +struct dft_special_stage_impl : dft_stage<T> +{ + dft_special_stage_impl() : stage1(radix1, size / radix1, 1), stage2(radix2, 1, size / radix2) + { + this->name = type_name<decltype(*this)>(); + this->radix = size; + this->blocks = 1; + this->repeats = 1; + this->recursion = false; + this->can_inplace = false; + this->temp_size = stage1.temp_size + stage2.temp_size + sizeof(complex<T>) * size; + this->data_size = stage1.data_size + stage2.data_size; + } + void dump() const override + { + dft_stage<T>::dump(); + printf(" "); + stage1.dump(); + printf(" "); + stage2.dump(); + } + void do_initialize(size_t stage_size) override + { + stage1.data = this->data; + stage2.data = this->data + stage1.data_size; + stage1.initialize(stage_size); + stage2.initialize(stage_size); + } + DFT_STAGE_FN + template <bool inverse> + KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size); + stage1.do_execute(cbool<inverse>, scratch, in, temp); + stage2.do_execute(cbool<inverse>, out, scratch, temp + stage1.temp_size); + } + dft_stage_fixed_impl<T, radix1> stage1; + dft_stage_fixed_final_impl<T, radix2> stage2; +}; + template <typename T, bool final> struct dft_stage_generic_impl : dft_stage<T> { dft_stage_generic_impl(size_t radix, size_t iterations, size_t blocks) { - this->stage_size = radix; + this->name = type_name<decltype(*this)>(); + this->radix = radix; this->blocks = blocks; this->repeats = iterations; this->recursion = false; // true; this->can_inplace = false; this->temp_size = align_up(sizeof(complex<T>) * radix, platform<>::native_cache_alignment); this->data_size = - align_up(sizeof(complex<T>) * sqr(this->stage_size / 2), platform<>::native_cache_alignment); + align_up(sizeof(complex<T>) * sqr(this->radix / 2), platform<>::native_cache_alignment); } protected: @@ -575,13 +731,12 @@ protected: { complex<T>* twiddle = ptr_cast<complex<T>>(this->data); CMT_LOOP_NOUNROLL - for (size_t i = 0; i < this->stage_size / 2; i++) + for (size_t i = 0; i < this->radix / 2; i++) { CMT_LOOP_NOUNROLL - for (size_t j = 0; j < this->stage_size / 2; j++) + for (size_t j = 0; j < this->radix / 2; j++) { - cwrite<1>(twiddle++, - cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->stage_size))); + cwrite<1>(twiddle++, cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->radix))); } } } @@ -593,17 +748,17 @@ protected: const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); const size_t bl = this->blocks; const size_t Nord = this->repeats; - const size_t N = Nord * this->stage_size; + const size_t N = Nord * this->radix; CMT_LOOP_NOUNROLL for (size_t b = 0; b < bl; b++) - generic_butterfly(this->stage_size, cbool<inverse>, out + b, in + b * this->stage_size, + generic_butterfly(this->radix, cbool<inverse>, out + b, in + b * this->radix, ptr_cast<complex<T>>(temp), twiddle, bl); } }; template <typename T, typename Tr2> -inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 r2) +inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 first_radix) { CMT_ASSUME(r0 > 1); CMT_ASSUME(r1 > 1); @@ -617,20 +772,20 @@ inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t { const complex<T>* in2 = in1; CMT_LOOP_UNROLL - for (size_t j = 0; j < r2; j++) + for (size_t j = 0; j < first_radix; j++) { *out++ = *in2; in2 += r1; } in1++; - in += r2; + in += first_radix; } } } -template <typename T> +template <typename T, typename Tr2> inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_t* radices, size_t count, - size_t index, size_t inscale, size_t inner_size) + size_t index, size_t inscale, size_t inner_size, Tr2 first_radix) { const bool b = index == 1; const size_t radix = radices[index]; @@ -641,7 +796,7 @@ inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_ { const complex<T>* in1 = in; CMT_LOOP_UNROLL - for (size_t j = 0; j < radices[0]; j++) + for (size_t j = 0; j < first_radix; j++) { *out++ = *in1; in1 += inner_size; @@ -656,7 +811,7 @@ inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_ CMT_LOOP_NOUNROLL for (size_t i = 0; i < steps; i++) { - dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size); + dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size, first_radix); in += inscale; } } @@ -667,15 +822,15 @@ struct dft_reorder_stage_impl : dft_stage<T> { dft_reorder_stage_impl(const int* radices, size_t count) : count(count) { + this->name = type_name<decltype(*this)>(); this->can_inplace = false; this->data_size = 0; - std::copy(std::make_reverse_iterator(radices + count), std::make_reverse_iterator(radices), - this->radices); + std::copy(radices, radices + count, this->radices); this->inner_size = 1; this->size = 1; for (size_t r = 0; r < count; r++) { - if (r != 0 && r != count - 2) + if (r != 0 && r != count - 1) this->inner_size *= radices[r]; this->size *= radices[r]; } @@ -692,38 +847,37 @@ protected: template <bool inverse> KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) { - // std::copy(in, in + this->size, out); - // return; - if (count == 3) - { - dft_permute(out, in, radices[2], radices[1], radices[0]); - } - else - { - const size_t rlast = radices[count - 1]; - for (size_t p = 0; p < rlast; p++) - { - dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size); - in += size / rlast; - } - } - // if (count == 1) - // { - // cswitch(dft_radices, radices[0], [&](auto rfirst) { dft_permute3(out, in, 1, 1, rfirst); - // }, - // [&]() { dft_permute3(out, in, 1, 1, radices[0]); }); - // } - - // const size_t rlast = radices[count - 1]; - // const size_t size = this->size / rlast; - // - // cswitch(dft_radices, radices[0], [&](auto rfirst) { - // for (size_t p = 0; p < rlast; p++) - // { - // reorder_impl(out, in, index, 1, rfirst); - // in += size; - // } - // }); + cswitch(dft_radices, radices[0], + [&](auto first_radix) { + if (count == 3) + { + dft_permute(out, in, radices[2], radices[1], first_radix); + } + else + { + const size_t rlast = radices[count - 1]; + for (size_t p = 0; p < rlast; p++) + { + dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, first_radix); + in += size / rlast; + } + } + }, + [&]() { + if (count == 3) + { + dft_permute(out, in, radices[2], radices[1], radices[0]); + } + else + { + const size_t rlast = radices[count - 1]; + for (size_t p = 0; p < rlast; p++) + { + dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, radices[0]); + in += size / rlast; + } + } + }); } }; @@ -732,6 +886,8 @@ struct fft_stage_impl : dft_stage<T> { fft_stage_impl(size_t stage_size) { + this->name = type_name<decltype(*this)>(); + this->radix = 4; this->stage_size = stage_size; this->repeats = 4; this->recursion = true; @@ -770,6 +926,8 @@ struct fft_final_stage_impl : dft_stage<T> { fft_final_stage_impl(size_t) { + this->name = type_name<decltype(*this)>(); + this->radix = size; this->stage_size = size; this->out_offset = size; this->repeats = 4; @@ -863,6 +1021,7 @@ struct fft_reorder_stage_impl : dft_stage<T> { fft_reorder_stage_impl(size_t stage_size) { + this->name = type_name<decltype(*this)>(); this->stage_size = stage_size; log2n = ilog2(stage_size); this->data_size = 0; @@ -887,7 +1046,7 @@ struct fft_specialization; template <typename T> struct fft_specialization<T, 1> : dft_stage<T> { - fft_specialization(size_t) {} + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } protected: constexpr static bool aligned = false; @@ -905,7 +1064,7 @@ protected: template <typename T> struct fft_specialization<T, 2> : dft_stage<T> { - fft_specialization(size_t) {} + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } protected: constexpr static bool aligned = false; @@ -923,7 +1082,7 @@ protected: template <typename T> struct fft_specialization<T, 3> : dft_stage<T> { - fft_specialization(size_t) {} + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } protected: constexpr static bool aligned = false; @@ -940,7 +1099,7 @@ protected: template <typename T> struct fft_specialization<T, 4> : dft_stage<T> { - fft_specialization(size_t) {} + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } protected: constexpr static bool aligned = false; @@ -957,7 +1116,7 @@ protected: template <typename T> struct fft_specialization<T, 5> : dft_stage<T> { - fft_specialization(size_t) {} + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } protected: constexpr static bool aligned = false; @@ -974,7 +1133,7 @@ protected: template <typename T> struct fft_specialization<T, 6> : dft_stage<T> { - fft_specialization(size_t) {} + fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } protected: constexpr static bool aligned = false; @@ -991,6 +1150,7 @@ struct fft_specialization<T, 7> : dft_stage<T> { fft_specialization(size_t) { + this->name = type_name<decltype(*this)>(); this->stage_size = 128; this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); } @@ -1018,7 +1178,8 @@ protected: { const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); final_pass<inverse>(csize_t<final_size>(), out, in, twiddle); - fft_reorder(out, csize_t<7>()); + if (this->need_reorder) + fft_reorder(out, csize_t<7>()); } template <bool inverse> @@ -1045,7 +1206,11 @@ protected: template <> struct fft_specialization<float, 8> : dft_stage<float> { - fft_specialization(size_t) { this->temp_size = sizeof(complex<float>) * 256; } + fft_specialization(size_t) + { + this->name = type_name<decltype(*this)>(); + this->temp_size = sizeof(complex<float>) * 256; + } protected: using T = float; @@ -1085,42 +1250,54 @@ template <> struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> { using T = double; - using fft_final_stage_impl<double, false, 256>::fft_final_stage_impl; + fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } DFT_STAGE_FN template <bool inverse> KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) { fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr); - fft_reorder(out, csize_t<8>()); + if (this->need_reorder) + fft_reorder(out, csize_t<8>()); } }; template <typename T> struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> { - using fft_final_stage_impl<T, false, 512>::fft_final_stage_impl; + fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } DFT_STAGE_FN template <bool inverse> KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) { fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr); - fft_reorder(out, csize_t<9>()); + if (this->need_reorder) + fft_reorder(out, csize_t<9>()); } }; template <typename T> struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> { - using fft_final_stage_impl<T, false, 1024>::fft_final_stage_impl; + fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size) + { + this->name = type_name<decltype(*this)>(); + } DFT_STAGE_FN template <bool inverse> KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*) { fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr); - fft_reorder(out, 10, cfalse); + if (this->need_reorder) + fft_reorder(out, 10, cfalse); } }; @@ -1133,7 +1310,7 @@ template <typename Stage, typename... Args> void dft_plan<T>::add_stage(Args... args) { dft_stage<T>* stage = new Stage(args...); - stage->name = nullptr; + stage->need_reorder = need_reorder; this->data_size += stage->data_size; this->temp_size += stage->temp_size; stages.push_back(dft_stage_ptr(stage)); @@ -1207,12 +1384,13 @@ void dft_plan<T>::initialize() offset += stage->data_size; } - bool to_scratch = false; + bool to_scratch = false; + bool scratch_needed = false; for (dft_stage_ptr& stage : reversed(stages)) { if (to_scratch) { - this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); + scratch_needed = true; } stage->to_scratch = to_scratch; if (!stage->can_inplace) @@ -1220,14 +1398,16 @@ void dft_plan<T>::initialize() to_scratch = !to_scratch; } } + if (scratch_needed || !stages[0]->can_inplace) + this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); } template <typename T> const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch) const + const complex<T>* scratch, bool in_scratch) const { if (stage == 0) - return in; + return in_scratch ? scratch : in; return stages[stage - 1]->to_scratch ? scratch : out; } @@ -1241,12 +1421,22 @@ template <typename T> template <bool inverse> void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const { + if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) + { + return stages[0]->execute(cbool<inverse>, out, in, temp); + } size_t stack[32] = { 0 }; complex<T>* scratch = ptr_cast<complex<T>>(temp + this->temp_size - align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); + bool in_scratch = !stages[0]->can_inplace && in == out; + if (in_scratch) + { + internal::builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); + } + const size_t count = stages.size(); for (size_t depth = 0; depth < count;) @@ -1266,7 +1456,7 @@ void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T else { complex<T>* rout = select_out(rdepth, out, scratch); - const complex<T>* rin = select_in(rdepth, out, in, scratch); + const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); offset += stages[rdepth]->out_offset; stack[rdepth]++; @@ -1281,15 +1471,16 @@ void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T else { stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), - select_in(depth, out, in, scratch), temp); + select_in(depth, out, in, scratch, in_scratch), temp); depth++; } } } template <typename T> -dft_plan<T>::dft_plan(size_t size, dft_type) : size(size), temp_size(0), data_size(0) +dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0) { + need_reorder = true; if (is_poweroftwo(size)) { const size_t log2n = ilog2(size); @@ -1303,62 +1494,82 @@ dft_plan<T>::dft_plan(size_t size, dft_type) : size(size), temp_size(0), data_si cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { this->make_fft(size, is_even, ctrue); constexpr size_t is_evenv = val_of(decltype(is_even)()); - this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size); + if (need_reorder) + this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size); }); }); } +#ifndef KFR_DFT_NO_NPo2 else { - size_t cur_size = size; - constexpr size_t radices_count = dft_radices.back() + 1; - u8 count[radices_count] = { 0 }; - int radices[32] = { 0 }; - size_t radices_size = 0; + if (size == 60) + { + this->add_stage<internal::dft_special_stage_impl<T, 6, 10>>(); + } + else if (size == 48) + { + this->add_stage<internal::dft_special_stage_impl<T, 6, 8>>(); + } + else + { + size_t cur_size = size; + constexpr size_t radices_count = dft_radices.back() + 1; + u8 count[radices_count] = { 0 }; + int radices[32] = { 0 }; + size_t radices_size = 0; + + cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) { + while (cur_size && cur_size % val_of(radix) == 0) + { + count[val_of(radix)]++; + cur_size /= val_of(radix); + } + }); - cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) { - while (cur_size && cur_size % val_of(radix) == 0) + if (cur_size >= 101) { - count[val_of(radix)]++; - cur_size /= val_of(radix); + this->add_stage<internal::dft_arblen_stage_impl<T>>(size); } - }); + else + { + size_t blocks = 1; + size_t iterations = size; - size_t blocks = 1; - size_t iterations = size; + for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) + { + for (size_t i = 0; i < count[r]; i++) + { + iterations /= r; + radices[radices_size++] = r; + if (iterations == 1) + this->prepare_dft_stage(r, iterations, blocks, ctrue); + else + this->prepare_dft_stage(r, iterations, blocks, cfalse); + blocks *= r; + } + } - for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) - { - for (size_t i = 0; i < count[r]; i++) - { - iterations /= r; - radices[radices_size++] = r; - if (iterations == 1) - this->prepare_dft_stage(r, iterations, blocks, ctrue); - else - this->prepare_dft_stage(r, iterations, blocks, cfalse); - blocks *= r; - } - } + if (cur_size > 1) + { + iterations /= cur_size; + radices[radices_size++] = cur_size; + if (iterations == 1) + this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + else + this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + } - if (cur_size > 1) - { - iterations /= cur_size; - radices[radices_size++] = cur_size; - if (iterations == 1) - this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); - else - this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + if (stages.size() > 2) + this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size); + } } - - if (stages.size() > 2) - this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size); } +#endif initialize(); } template <typename T> -dft_plan_real<T>::dft_plan_real(size_t size, dft_type type) - : dft_plan<T>(size / 2, type), size(size), rtwiddle(size / 4) +dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) { using namespace internal; @@ -1463,6 +1674,15 @@ dft_plan<T>::~dft_plan() { } +template <typename T> +void dft_plan<T>::dump() const +{ + for (const dft_stage_ptr& s : stages) + { + s->dump(); + } +} + namespace internal { @@ -1621,24 +1841,26 @@ template void convolve_filter<double>::set_data(const univector<double>&); template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size); template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size); -template dft_plan<float>::dft_plan(size_t, dft_type); +template dft_plan<float>::dft_plan(size_t, dft_order); template dft_plan<float>::~dft_plan(); +template void dft_plan<float>::dump() const; template void dft_plan<float>::execute_dft(cometa::cbool_t<false>, kfr::complex<float>* out, const kfr::complex<float>* in, kfr::u8* temp) const; template void dft_plan<float>::execute_dft(cometa::cbool_t<true>, kfr::complex<float>* out, const kfr::complex<float>* in, kfr::u8* temp) const; -template dft_plan_real<float>::dft_plan_real(size_t, dft_type); +template dft_plan_real<float>::dft_plan_real(size_t); template void dft_plan_real<float>::from_fmt(kfr::complex<float>* out, const kfr::complex<float>* in, kfr::dft_pack_format fmt) const; template void dft_plan_real<float>::to_fmt(kfr::complex<float>* out, kfr::dft_pack_format fmt) const; -template dft_plan<double>::dft_plan(size_t, dft_type); +template dft_plan<double>::dft_plan(size_t, dft_order); template dft_plan<double>::~dft_plan(); +template void dft_plan<double>::dump() const; template void dft_plan<double>::execute_dft(cometa::cbool_t<false>, kfr::complex<double>* out, const kfr::complex<double>* in, kfr::u8* temp) const; template void dft_plan<double>::execute_dft(cometa::cbool_t<true>, kfr::complex<double>* out, const kfr::complex<double>* in, kfr::u8* temp) const; -template dft_plan_real<double>::dft_plan_real(size_t, dft_type); +template dft_plan_real<double>::dft_plan_real(size_t); template void dft_plan_real<double>::from_fmt(kfr::complex<double>* out, const kfr::complex<double>* in, kfr::dft_pack_format fmt) const; template void dft_plan_real<double>::to_fmt(kfr::complex<double>* out, kfr::dft_pack_format fmt) const; diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -54,6 +54,12 @@ enum class dft_type inverse }; +enum class dft_order +{ + normal, + internal, // possibly bit/digit-reversed, implementation-defined, faster +}; + template <typename T> struct dft_stage; @@ -65,7 +71,9 @@ struct dft_plan size_t size; size_t temp_size; - dft_plan(size_t size, dft_type = dft_type::both); + dft_plan(size_t size, dft_order order = dft_order::normal); + + void dump() const; KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const { @@ -101,6 +109,7 @@ protected: autofree<u8> data; size_t data_size; std::vector<dft_stage_ptr> stages; + bool need_reorder; template <typename Stage, typename... Args> void add_stage(Args... args); @@ -115,7 +124,8 @@ protected: template <bool inverse> void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const; - const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, const complex<T>* scratch) const; + const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, + const complex<T>* scratch, bool in_scratch) const; complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const; }; @@ -129,7 +139,7 @@ template <typename T> struct dft_plan_real : dft_plan<T> { size_t size; - dft_plan_real(size_t size, dft_type = dft_type::both); + dft_plan_real(size_t size); void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete; diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp @@ -542,8 +542,10 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x) template <size_t N, typename T> KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) { - w0 = a0 + a1; - w1 = a0 - a1; + const cvec<T, N> sum = a0 + a1; + const cvec<T, N> dif = a0 - a1; + w0 = sum; + w1 = dif; } template <size_t N, typename T> @@ -1016,21 +1018,25 @@ KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1) a1 = cmul(a1, tw1); } +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw3r1 = static_cast<T>(-0.5 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw3i1 = + static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>(); + template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly3(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, - cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02) +KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02) { - const static cvec<T, N> tw3r1 = static_cast<T>(-0.5 - 1.0); - const static cvec<T, N> tw3i1 = - static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>(); const cvec<T, N> sum1 = a01 + a02; const cvec<T, N> dif1 = swap<2>(a01 - a02); w00 = a00 + sum1; - const cvec<T, N> s1 = w00 + sum1 * tw3r1; + const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>; - const cvec<T, N> d1 = dif1 * tw3i1; + const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>; w01 = s1 + d1; w02 = s1 - d1; @@ -1073,22 +1079,80 @@ KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec< butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5); } +template <typename T, bool inverse = false> +const static cvec<T, 1> tw9_1 = { T(0.76604444311897803520239265055541), + (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) }; +template <typename T, bool inverse = false> +const static cvec<T, 1> tw9_2 = { T(0.17364817766693034885171662676931), + (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) }; +template <typename T, bool inverse = false> +const static cvec<T, 1> tw9_4 = { T(-0.93969262078590838405410927732473), + (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) }; + template <size_t N, bool inverse = false, typename T> -KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, - const cvec<T, N>& a03, const cvec<T, N>& a04, const cvec<T, N>& a05, - const cvec<T, N>& a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, - cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06) +KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, cvec<T, N>& w0, + cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, + cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) +{ + cvec<T, N* 3> a012 = concat(a0, a1, a2); + cvec<T, N* 3> a345 = concat(a3, a4, a5); + cvec<T, N* 3> a678 = concat(a6, a7, a8); + butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678); + cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8; + split(a012, t0, t1, t2); + split(a345, t3, t4, t5); + split(a678, t6, t7, t8); + + t4 = cmul(t4, tw9_1<T, inverse>); + t5 = cmul(t5, tw9_2<T, inverse>); + t7 = cmul(t7, tw9_2<T, inverse>); + t8 = cmul(t8, tw9_4<T, inverse>); + + cvec<T, N* 3> t036 = concat(t0, t3, t6); + cvec<T, N* 3> t147 = concat(t1, t4, t7); + cvec<T, N* 3> t258 = concat(t2, t5, t8); + + butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258); + split(t036, w0, w1, w2); + split(t147, w3, w4, w5); + split(t258, w6, w7, w8); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRIN void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) { - const static cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884 - 1.0); - const static cvec<T, N> tw7i1 = - static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>(); - const static cvec<T, N> tw7r2 = static_cast<T>(-0.2225209339563144042889025645 - 1.0); - const static cvec<T, N> tw7i2 = - static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>(); - const static cvec<T, N> tw7r3 = static_cast<T>(-0.90096886790241912623610231951 - 1.0); - const static cvec<T, N> tw7i3 = - static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>(); + butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8); +} + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw7i1 = + static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw7r2 = static_cast<T>(-0.2225209339563144042889025645 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw7i2 = + static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw7r3 = static_cast<T>(-0.90096886790241912623610231951 - 1.0); +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw7i3 = + static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>(); + +template <size_t N, bool inverse = false, typename T> +KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, + cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06) +{ const cvec<T, N> sum1 = a01 + a06; const cvec<T, N> dif1 = swap<2>(a01 - a06); const cvec<T, N> sum2 = a02 + a05; @@ -1097,13 +1161,19 @@ KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const c const cvec<T, N> dif3 = swap<2>(a03 - a04); w00 = a00 + sum1 + sum2 + sum3; - const cvec<T, N> s1 = w00 + sum1 * tw7r1 + sum2 * tw7r2 + sum3 * tw7r3; - const cvec<T, N> s2 = w00 + sum1 * tw7r2 + sum2 * tw7r3 + sum3 * tw7r1; - const cvec<T, N> s3 = w00 + sum1 * tw7r3 + sum2 * tw7r1 + sum3 * tw7r2; + const cvec<T, N> s1 = + w00 + sum1 * tw7r1<T, N, inverse> + sum2 * tw7r2<T, N, inverse> + sum3 * tw7r3<T, N, inverse>; + const cvec<T, N> s2 = + w00 + sum1 * tw7r2<T, N, inverse> + sum2 * tw7r3<T, N, inverse> + sum3 * tw7r1<T, N, inverse>; + const cvec<T, N> s3 = + w00 + sum1 * tw7r3<T, N, inverse> + sum2 * tw7r1<T, N, inverse> + sum3 * tw7r2<T, N, inverse>; - const cvec<T, N> d1 = dif1 * tw7i1 + dif2 * tw7i2 + dif3 * tw7i3; - const cvec<T, N> d2 = dif1 * tw7i2 - dif2 * tw7i3 - dif3 * tw7i1; - const cvec<T, N> d3 = dif1 * tw7i3 - dif2 * tw7i1 + dif3 * tw7i2; + const cvec<T, N> d1 = + dif1 * tw7i1<T, N, inverse> + dif2 * tw7i2<T, N, inverse> + dif3 * tw7i3<T, N, inverse>; + const cvec<T, N> d2 = + dif1 * tw7i2<T, N, inverse> - dif2 * tw7i3<T, N, inverse> - dif3 * tw7i1<T, N, inverse>; + const cvec<T, N> d3 = + dif1 * tw7i3<T, N, inverse> - dif2 * tw7i1<T, N, inverse> + dif3 * tw7i2<T, N, inverse>; w01 = s1 + d1; w06 = s1 - d1; @@ -1120,29 +1190,131 @@ KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec< butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6); } +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i1 = + static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i2 = + static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r3 = static_cast<T>(-0.14231483827328514044379266862 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i3 = + static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i4 = + static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i5 = + static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>(); + +template <size_t N, bool inverse = false, typename T> +KFR_INTRIN void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, + cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, + cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, + cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) +{ + const cvec<T, N> sum1 = a01 + a10; + const cvec<T, N> dif1 = swap<2>(a01 - a10); + const cvec<T, N> sum2 = a02 + a09; + const cvec<T, N> dif2 = swap<2>(a02 - a09); + const cvec<T, N> sum3 = a03 + a08; + const cvec<T, N> dif3 = swap<2>(a03 - a08); + const cvec<T, N> sum4 = a04 + a07; + const cvec<T, N> dif4 = swap<2>(a04 - a07); + const cvec<T, N> sum5 = a05 + a06; + const cvec<T, N> dif5 = swap<2>(a05 - a06); + w00 = a00 + sum1 + sum2 + sum3 + sum4 + sum5; + + const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> + + sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> + + sum5 * tw11r5<T, N, inverse>; + const cvec<T, N> s2 = w00 + sum1 * tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> + + sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> + + sum5 * tw11r1<T, N, inverse>; + const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> + + sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> + + sum5 * tw11r2<T, N, inverse>; + const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> + + sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> + + sum5 * tw11r3<T, N, inverse>; + const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> + + sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> + + sum5 * tw11r4<T, N, inverse>; + + const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> + + dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> + + dif5 * tw11i5<T, N, inverse>; + const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> - + dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> - + dif5 * tw11i1<T, N, inverse>; + const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> + + dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> + + dif5 * tw11i2<T, N, inverse>; + const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> + + dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> - + dif5 * tw11i3<T, N, inverse>; + const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> + + dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> + + dif5 * tw11i4<T, N, inverse>; + + w01 = s1 + d1; + w10 = s1 - d1; + w02 = s2 + d2; + w09 = s2 - d2; + w03 = s3 + d3; + w08 = s3 - d3; + w04 = s4 + d4; + w07 = s4 - d4; + w05 = s5 + d5; + w06 = s5 - d5; +} + +template <typename T, size_t N, bool inverse> +const static cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718 - 1.0); +template <typename T, size_t N, bool inverse> +const static cvec<T, N> tw5i1 = + static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>(); +template <typename T, size_t N, bool inverse> +const static cvec<T, N> tw5r2 = static_cast<T>(-0.80901699437494742410229341718 - 1.0); +template <typename T, size_t N, bool inverse> +const static cvec<T, N> tw5i2 = + static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>(); + template <size_t N, bool inverse = false, typename T> KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) { - const static cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718 - 1.0); - const static cvec<T, N> tw5i1 = - static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>(); - const static cvec<T, N> tw5r2 = static_cast<T>(-0.80901699437494742410229341718 - 1.0); - const static cvec<T, N> tw5i2 = - static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>(); - const cvec<T, N> sum1 = a01 + a04; const cvec<T, N> dif1 = swap<2>(a01 - a04); const cvec<T, N> sum2 = a02 + a03; const cvec<T, N> dif2 = swap<2>(a02 - a03); w00 = a00 + sum1 + sum2; - const cvec<T, N> s1 = w00 + sum1 * tw5r1 + sum2 * tw5r2; - const cvec<T, N> s2 = w00 + sum1 * tw5r2 + sum2 * tw5r1; + const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse> + sum2 * tw5r2<T, N, inverse>; + const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse> + sum2 * tw5r1<T, N, inverse>; - const cvec<T, N> d1 = dif1 * tw5i1 + dif2 * tw5i2; - const cvec<T, N> d2 = dif1 * tw5i2 - dif2 * tw5i1; + const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse> + dif2 * tw5i2<T, N, inverse>; + const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse> - dif2 * tw5i1<T, N, inverse>; w01 = s1 + d1; w04 = s1 - d1; @@ -1245,6 +1417,16 @@ KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N template <bool inverse, typename T, size_t N> KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, + const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, vec<T, N>& out0, + vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, + vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8) +{ + butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4, + out5, out6, out7, out8); +} +template <bool inverse, typename T, size_t N> +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, @@ -1253,6 +1435,17 @@ KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9); } +template <bool inverse, typename T, size_t N> +KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2, + const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, + const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, + const vec<T, N>& in9, const vec<T, N>& in10, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, + vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) +{ + butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2, + out3, out4, out5, out6, out7, out8, out9, out10); +} template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { @@ -1360,15 +1553,17 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args) butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...); } -template <typename T, bool inverse, typename Tstride> -KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, size_t, cbool_t<inverse>, complex<T>*, const complex<T>*, - Tstride, size_t, size_t, const complex<T>*, size_t) +template <typename T, bool inverse, typename Tradix, typename Tstride> +KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, Tradix radix, cbool_t<inverse>, complex<T>*, + const complex<T>*, Tstride, size_t, size_t, const complex<T>*, size_t) { } -template <size_t width, bool inverse, typename T, typename Tstride> -KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, Tstride ostride, size_t halfradix, - size_t halfradix_sqr, const complex<T>* twiddle, size_t i) + +template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix, + typename Thalfradixsqr, typename Tstride> +KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, Tstride ostride, Thalfradix halfradix, + Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) { CMT_LOOP_NOUNROLL for (; i < halfradix / width * width; i += width) @@ -1377,7 +1572,6 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<in cvec<T, width> sum0 = resize<2 * width>(in0); cvec<T, width> sum1 = sum0; - CMT_LOOP_NOUNROLL for (size_t j = 0; j < halfradix; j++) { const cvec<T, 1> ina = cread<1>(in + (1 + j)); @@ -1442,6 +1636,36 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* } cwrite<1>(out, hcadd(sum) + sums); } + const auto halfradix = radix / 2; + const auto halfradix_sqr = halfradix * halfradix; + CMT_ASSUME(halfradix > 0); + size_t i = 0; + + generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, + halfradix * halfradix, twiddle, i); +} + +template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>> +KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, const complex<T>* twiddle, + Tstride ostride = Tstride{}) +{ + { + cvec<T, width> sum = T(); + size_t j = 0; + CMT_LOOP_UNROLL + for (; j < radix / width * width; j += width) + { + sum += cread<width>(in + j); + } + cvec<T, 1> sums = T(); + CMT_LOOP_UNROLL + for (; j < radix; j++) + { + sums += cread<1>(in + j); + } + cwrite<1>(out, hcadd(sum) + sums); + } const size_t halfradix = radix / 2; const size_t halfradix_sqr = halfradix * halfradix; CMT_ASSUME(halfradix > 0); @@ -1455,21 +1679,15 @@ template <typename T, bool inverse, typename Tstride = csize_t<1>> KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, complex<T>* temp, const complex<T>* twiddle, Tstride ostride = Tstride{}) { - if (out == in) - { - builtin_memcpy(temp, in, sizeof(complex<T>) * radix); - in = temp; - } constexpr size_t width = platform<T>::vector_width; - generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); - /*cswitch(csizes_t<11>(), radix, + cswitch(csizes_t<11, 13>(), radix, [&](auto radix_) CMT_INLINE_LAMBDA { - generic_butterfly_w<width>(decltype(radix_)(), cbool_t<inverse>(), out, in, twiddle, ostride); + spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride); }, [&]() CMT_INLINE_LAMBDA { generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); - });*/ + }); } template <typename T, size_t N> diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp @@ -191,13 +191,23 @@ struct test_case check(result, as_string(comparison.left), expr); } + void append_comment(const std::string& text) + { + comment += text; + if (show_progress) + { + println(); + println(text, ":"); + } + } + void set_comment(const std::string& text) { comment = text; if (show_progress) { println(); - println(comment, ":"); + println(text, ":"); } } @@ -344,6 +354,6 @@ void assert_is_same_decay() #define TEST TESTO_TEST #define DTEST TESTO_DTEST #endif -} +} // namespace testo CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -10,6 +10,7 @@ #include <kfr/dft.hpp> #include <kfr/dsp.hpp> #include <kfr/io.hpp> +#include <set> using namespace kfr; @@ -70,56 +71,71 @@ TEST(fft_accuracy) { testo::active_test()->show_progress = true; random_bit_generator gen(2247448713, 915890490, 864203735, 2982561); - - testo::matrix(named("type") = dft_float_types, // - named("inverse") = std::make_tuple(false, true), // - named("log2(size)") = make_range(size_t(1), stopsize), // - [&gen](auto type, bool inverse, size_t log2size) { - using float_type = type_of<decltype(type)>; - const size_t size = 1 << log2size; - - { - univector<complex<float_type>> in = - truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); - univector<complex<float_type>> out = in; - univector<complex<float_type>> refout = out; - const dft_plan<float_type> dft(size); - univector<u8> temp(dft.temp_size); - - reference_dft(refout.data(), in.data(), size, inverse); - dft.execute(out, out, temp, inverse); - - const float_type rms_diff = rms(cabs(refout - out)); - const double ops = log2size * 100; - const double epsilon = std::numeric_limits<float_type>::epsilon(); - CHECK(rms_diff < epsilon * ops); - } - - if (size >= 16) - { - univector<float_type> in = - truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); - - univector<complex<float_type>> out = truncate(scalar(qnan), size); - univector<complex<float_type>> refout = truncate(scalar(qnan), size); - const dft_plan_real<float_type> dft(size); - univector<u8> temp(dft.temp_size); - - reference_fft(refout.data(), in.data(), size); - dft.execute(out, in, temp); - const float_type rms_diff_r = - rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1))); - const double ops = log2size * 200; - const double epsilon = std::numeric_limits<float_type>::epsilon(); - CHECK(rms_diff_r < epsilon * ops); - - univector<float_type> out2(size, 0.f); - dft.execute(out2, out, temp); - out2 = out2 / size; - const float_type rms_diff_r2 = rms(in - out2); - CHECK(rms_diff_r2 < epsilon * ops); - } - }); + std::set<size_t> size_set; + univector<size_t> sizes = truncate(1 + counter(), stopsize - 1); + sizes = round(pow(2.0, sizes)); + +#ifndef KFR_DFT_NO_NPo2 + univector<size_t> sizes2 = truncate(2 + counter(), 1024); + for (size_t s : sizes2) + { + if (std::find(sizes.begin(), sizes.end(), s) == sizes.end()) + sizes.push_back(s); + } +#endif + println(sizes); + + testo::matrix( + named("type") = dft_float_types, // + named("size") = sizes, // + [&gen](auto type, size_t size) { + using float_type = type_of<decltype(type)>; + const double min_prec = 0.000001 * std::log(size) * size; + + for (bool inverse : { false, true }) + { + testo::active_test()->append_comment(inverse ? "complex-inverse" : "complex-direct"); + univector<complex<float_type>> in = + truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); + univector<complex<float_type>> out = in; + univector<complex<float_type>> refout = out; + univector<complex<float_type>> outo = in; + const dft_plan<float_type> dft(size); + univector<u8> temp(dft.temp_size); + + reference_dft(refout.data(), in.data(), size, inverse); + dft.execute(outo, in, temp, inverse); + dft.execute(out, out, temp, inverse); + + const float_type rms_diff_inplace = rms(cabs(refout - out)); + CHECK(rms_diff_inplace < min_prec); + const float_type rms_diff_outofplace = rms(cabs(refout - outo)); + CHECK(rms_diff_outofplace < min_prec); + } + + if (size >= 4 && is_poweroftwo(size)) + { + univector<float_type> in = truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size); + + univector<complex<float_type>> out = truncate(scalar(qnan), size); + univector<complex<float_type>> refout = truncate(scalar(qnan), size); + const dft_plan_real<float_type> dft(size); + univector<u8> temp(dft.temp_size); + + testo::active_test()->append_comment("real-direct"); + reference_fft(refout.data(), in.data(), size); + dft.execute(out, in, temp); + float_type rms_diff = rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1))); + CHECK(rms_diff < min_prec); + + univector<float_type> out2(size, 0.f); + testo::active_test()->append_comment("real-inverse"); + dft.execute(out2, out, temp); + out2 = out2 / size; + rms_diff = rms(in - out2); + CHECK(rms_diff < min_prec); + } + }); } #ifndef KFR_NO_MAIN @@ -127,6 +143,6 @@ int main() { println(library_version(), " running on ", cpu_runtime()); - return testo::run_all("", true); + return testo::run_all("", false); } #endif