Non-power of 2 DFT - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 194011494c612013f9dc8d75e96bd7fbd97e9b75
parent a92f4352d611566e7bbf3590d0768a9f444d8c14
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 27 Nov 2018 20:04:10 +0000

Non-power of 2 DFT

Diffstat:
M README.md  | 2 +-
M include/kfr/dft/dft-src.cpp  | 490 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M include/kfr/dft/fft.hpp  | 16 +++++++++++++---
M include/kfr/dft/ft.hpp  | 330 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M include/kfr/testo/testo.hpp  | 14 ++++++++++++--
M tests/dft_test.cpp  | 118 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------

6 files changed, 723 insertions(+), 247 deletions(-)
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ KFR is a header-only library (except DFT) and has no external dependencies.
 
 * Full AVX-512 support
 * EBU R128
-* Non-power of two DFT (coming soon)
+* Non-power of two DFT
 * TruePeak (coming soon)
 * Number of automatic tests has been increased
 * GPL version changed from 3 to 2+
diff --git a/include/kfr/dft/dft-src.cpp b/include/kfr/dft/dft-src.cpp
@@ -45,7 +45,7 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
 namespace kfr
 {
 
-constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 10> dft_radices{};
+constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{};
 
 #define DFT_ASSERT TESTO_ASSERT_INACTIVE
 
@@ -58,22 +58,30 @@ using cinvert_t = ctrue_t;
 template <typename T>
 struct dft_stage
 {
+    size_t radix      = 0;
     size_t stage_size = 0;
     size_t data_size  = 0;
     size_t temp_size  = 0;
     u8* data          = nullptr;
     size_t repeats    = 1;
     size_t out_offset = 0;
-    size_t width      = 0;
     size_t blocks     = 0;
-    const char* name;
-    bool recursion   = false;
-    bool can_inplace = true;
-    bool inplace     = false;
-    bool to_scratch  = false;
+    const char* name  = nullptr;
+    bool recursion    = false;
+    bool can_inplace  = true;
+    bool inplace      = false;
+    bool to_scratch   = false;
+    bool need_reorder = true;
 
     void initialize(size_t size) { do_initialize(size); }
 
+    virtual void dump() const
+    {
+        printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
+               stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
+               to_scratch);
+    }
+
     KFR_INTRIN void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
     {
         do_execute(cdirect_t(), out, in, temp);
@@ -472,7 +480,7 @@ template <typename T>
 static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
 {
     complex<T>* twiddle = ptr_cast<complex<T>>(stage->data);
-    const size_t N      = stage->repeats * stage->stage_size;
+    const size_t N      = stage->repeats * stage->radix;
     const size_t Nord   = stage->repeats;
     size_t i            = 0;
 
@@ -482,7 +490,7 @@ static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
         for (; i < Nord / width * width; i += width)
         {
             CMT_LOOP_NOUNROLL
-            for (size_t j = 1; j < stage->stage_size; j++)
+            for (size_t j = 1; j < stage->radix; j++)
             {
                 CMT_LOOP_NOUNROLL
                 for (size_t k = 0; k < width; k++)
@@ -502,16 +510,17 @@ struct dft_stage_fixed_impl : dft_stage<T>
 {
     dft_stage_fixed_impl(size_t radix_, size_t iterations, size_t blocks)
     {
-        this->stage_size = radix;
-        this->blocks     = blocks;
-        this->repeats    = iterations;
-        this->recursion  = false; // true;
+        this->name      = type_name<decltype(*this)>();
+        this->radix     = radix;
+        this->blocks    = blocks;
+        this->repeats   = iterations;
+        this->recursion = false; // true;
         this->data_size =
             align_up((this->repeats * (radix - 1)) * sizeof(complex<T>), platform<>::native_cache_alignment);
     }
 
-protected:
-    constexpr static size_t width = fft_vector_width<T>;
+    constexpr static size_t width =
+        radix >= 7 ? fft_vector_width<T> / 2 : radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2;
     virtual void do_initialize(size_t size) override final { dft_stage_fixed_initialize(this, width); }
 
     DFT_STAGE_FN
@@ -521,7 +530,7 @@ protected:
         const size_t Nord         = this->repeats;
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
 
-        const size_t N = Nord * this->stage_size;
+        const size_t N = Nord * this->radix;
         CMT_LOOP_NOUNROLL
         for (size_t b = 0; b < this->blocks; b++)
         {
@@ -537,13 +546,16 @@ struct dft_stage_fixed_final_impl : dft_stage<T>
 {
     dft_stage_fixed_final_impl(size_t radix_, size_t iterations, size_t blocks)
     {
-        this->stage_size  = radix;
+        this->name        = type_name<decltype(*this)>();
+        this->radix       = radix;
         this->blocks      = blocks;
         this->repeats     = iterations;
-        this->recursion   = false; // true;
+        this->recursion   = false;
         this->can_inplace = false;
     }
-    constexpr static size_t width = fft_vector_width<T>;
+    constexpr static size_t width =
+        radix >= 7 ? fft_vector_width<T> / 2 : radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2;
+
     DFT_STAGE_FN
     template <bool inverse>
     KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
@@ -555,19 +567,163 @@ struct dft_stage_fixed_final_impl : dft_stage<T>
     }
 };
 
+template <typename E>
+inline E& apply_conj(E& e, cfalse_t)
+{
+    return e;
+}
+
+template <typename E>
+inline auto apply_conj(E& e, ctrue_t)
+{
+    return cconj(e);
+}
+
+/// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1]
+template <typename E>
+struct fft_inverse : expression_base<E>
+{
+    using value_type = value_type_of<E>;
+
+    CMT_INLINE fft_inverse(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {}
+
+    CMT_INLINE vec<value_type, 1> operator()(cinput_t input, size_t index, vec_t<value_type, 1>) const
+    {
+        return this->argument_first(input, index == 0 ? 0 : this->size() - index, vec_t<value_type, 1>());
+    }
+
+    template <size_t N>
+    CMT_INLINE vec<value_type, N> operator()(cinput_t input, size_t index, vec_t<value_type, N>) const
+    {
+        if (index == 0)
+        {
+            return concat(
+                this->argument_first(input, index, vec_t<value_type, 1>()),
+                reverse(this->argument_first(input, this->size() - (N - 1), vec_t<value_type, N - 1>())));
+        }
+        return reverse(this->argument_first(input, this->size() - index - (N - 1), vec_t<value_type, N>()));
+    }
+};
+
+template <typename E>
+inline auto apply_fft_inverse(E&& e)
+{
+    return fft_inverse<E>(std::forward<E>(e));
+}
+
+template <typename T>
+struct dft_arblen_stage_impl : dft_stage<T>
+{
+    dft_arblen_stage_impl(size_t size)
+        : fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal), size(size)
+    {
+        this->name        = type_name<decltype(*this)>();
+        this->radix       = size;
+        this->blocks      = 1;
+        this->repeats     = 1;
+        this->recursion   = false;
+        this->can_inplace = false;
+        this->temp_size   = plan.temp_size;
+
+        chirp_ = render(cexp(sqr(linspace(T(1) - size, size - T(1), size * 2 - 1, true, true)) *
+                             complex<T>(0, -1) * c_pi<T> / size));
+
+        ichirpp_ = render(truncate(padded(1 / slice(chirp_, 0, 2 * size - 1)), fftsize));
+
+        univector<u8> temp(plan.temp_size);
+        plan.execute(ichirpp_, ichirpp_, temp);
+        xp.resize(fftsize, 0);
+        xp_fft.resize(fftsize);
+        invN2 = T(1) / fftsize;
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+    {
+        const size_t n  = this->size;
+        const size_t N2 = this->fftsize;
+
+        auto&& chirp = apply_conj(chirp_, cbool<inverse>);
+
+        xp.slice(0, n) = make_univector(in, n) * slice(chirp, n - 1);
+
+        plan.execute(xp_fft.data(), xp.data(), temp);
+
+        if (inverse)
+            xp_fft = xp_fft * cconj(apply_fft_inverse(ichirpp_));
+        else
+            xp_fft = xp_fft * ichirpp_;
+        plan.execute(xp_fft.data(), xp_fft.data(), temp, ctrue);
+
+        make_univector(out, n) = xp_fft.slice(n - 1) * slice(chirp, n - 1) * invN2;
+    }
+
+    const size_t size;
+    const size_t fftsize;
+    T invN2;
+    dft_plan<T> plan;
+    univector<complex<T>> chirp_;
+    univector<complex<T>> ichirpp_;
+    univector<complex<T>> xp;
+    univector<complex<T>> xp_fft;
+};
+
+template <typename T, size_t radix1, size_t radix2, size_t size = radix1* radix2>
+struct dft_special_stage_impl : dft_stage<T>
+{
+    dft_special_stage_impl() : stage1(radix1, size / radix1, 1), stage2(radix2, 1, size / radix2)
+    {
+        this->name        = type_name<decltype(*this)>();
+        this->radix       = size;
+        this->blocks      = 1;
+        this->repeats     = 1;
+        this->recursion   = false;
+        this->can_inplace = false;
+        this->temp_size   = stage1.temp_size + stage2.temp_size + sizeof(complex<T>) * size;
+        this->data_size   = stage1.data_size + stage2.data_size;
+    }
+    void dump() const override
+    {
+        dft_stage<T>::dump();
+        printf("    ");
+        stage1.dump();
+        printf("    ");
+        stage2.dump();
+    }
+    void do_initialize(size_t stage_size) override
+    {
+        stage1.data = this->data;
+        stage2.data = this->data + stage1.data_size;
+        stage1.initialize(stage_size);
+        stage2.initialize(stage_size);
+    }
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+    {
+        complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size);
+        stage1.do_execute(cbool<inverse>, scratch, in, temp);
+        stage2.do_execute(cbool<inverse>, out, scratch, temp + stage1.temp_size);
+    }
+    dft_stage_fixed_impl<T, radix1> stage1;
+    dft_stage_fixed_final_impl<T, radix2> stage2;
+};
+
 template <typename T, bool final>
 struct dft_stage_generic_impl : dft_stage<T>
 {
     dft_stage_generic_impl(size_t radix, size_t iterations, size_t blocks)
     {
-        this->stage_size  = radix;
+        this->name        = type_name<decltype(*this)>();
+        this->radix       = radix;
         this->blocks      = blocks;
         this->repeats     = iterations;
         this->recursion   = false; // true;
         this->can_inplace = false;
         this->temp_size   = align_up(sizeof(complex<T>) * radix, platform<>::native_cache_alignment);
         this->data_size =
-            align_up(sizeof(complex<T>) * sqr(this->stage_size / 2), platform<>::native_cache_alignment);
+            align_up(sizeof(complex<T>) * sqr(this->radix / 2), platform<>::native_cache_alignment);
     }
 
 protected:
@@ -575,13 +731,12 @@ protected:
     {
         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
         CMT_LOOP_NOUNROLL
-        for (size_t i = 0; i < this->stage_size / 2; i++)
+        for (size_t i = 0; i < this->radix / 2; i++)
         {
             CMT_LOOP_NOUNROLL
-            for (size_t j = 0; j < this->stage_size / 2; j++)
+            for (size_t j = 0; j < this->radix / 2; j++)
             {
-                cwrite<1>(twiddle++,
-                          cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->stage_size)));
+                cwrite<1>(twiddle++, cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->radix)));
             }
         }
     }
@@ -593,17 +748,17 @@ protected:
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
         const size_t bl           = this->blocks;
         const size_t Nord         = this->repeats;
-        const size_t N            = Nord * this->stage_size;
+        const size_t N            = Nord * this->radix;
 
         CMT_LOOP_NOUNROLL
         for (size_t b = 0; b < bl; b++)
-            generic_butterfly(this->stage_size, cbool<inverse>, out + b, in + b * this->stage_size,
+            generic_butterfly(this->radix, cbool<inverse>, out + b, in + b * this->radix,
                               ptr_cast<complex<T>>(temp), twiddle, bl);
     }
 };
 
 template <typename T, typename Tr2>
-inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 r2)
+inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 first_radix)
 {
     CMT_ASSUME(r0 > 1);
     CMT_ASSUME(r1 > 1);
@@ -617,20 +772,20 @@ inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t
         {
             const complex<T>* in2 = in1;
             CMT_LOOP_UNROLL
-            for (size_t j = 0; j < r2; j++)
+            for (size_t j = 0; j < first_radix; j++)
             {
                 *out++ = *in2;
                 in2 += r1;
             }
             in1++;
-            in += r2;
+            in += first_radix;
         }
     }
 }
 
-template <typename T>
+template <typename T, typename Tr2>
 inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_t* radices, size_t count,
-                             size_t index, size_t inscale, size_t inner_size)
+                             size_t index, size_t inscale, size_t inner_size, Tr2 first_radix)
 {
     const bool b       = index == 1;
     const size_t radix = radices[index];
@@ -641,7 +796,7 @@ inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_
         {
             const complex<T>* in1 = in;
             CMT_LOOP_UNROLL
-            for (size_t j = 0; j < radices[0]; j++)
+            for (size_t j = 0; j < first_radix; j++)
             {
                 *out++ = *in1;
                 in1 += inner_size;
@@ -656,7 +811,7 @@ inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_
         CMT_LOOP_NOUNROLL
         for (size_t i = 0; i < steps; i++)
         {
-            dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size);
+            dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size, first_radix);
             in += inscale;
         }
     }
@@ -667,15 +822,15 @@ struct dft_reorder_stage_impl : dft_stage<T>
 {
     dft_reorder_stage_impl(const int* radices, size_t count) : count(count)
     {
+        this->name        = type_name<decltype(*this)>();
         this->can_inplace = false;
         this->data_size   = 0;
-        std::copy(std::make_reverse_iterator(radices + count), std::make_reverse_iterator(radices),
-                  this->radices);
+        std::copy(radices, radices + count, this->radices);
         this->inner_size = 1;
         this->size       = 1;
         for (size_t r = 0; r < count; r++)
         {
-            if (r != 0 && r != count - 2)
+            if (r != 0 && r != count - 1)
                 this->inner_size *= radices[r];
             this->size *= radices[r];
         }
@@ -692,38 +847,37 @@ protected:
     template <bool inverse>
     KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
-        //        std::copy(in, in + this->size, out);
-        //        return;
-        if (count == 3)
-        {
-            dft_permute(out, in, radices[2], radices[1], radices[0]);
-        }
-        else
-        {
-            const size_t rlast = radices[count - 1];
-            for (size_t p = 0; p < rlast; p++)
-            {
-                dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size);
-                in += size / rlast;
-            }
-        }
-        //        if (count == 1)
-        //        {
-        //            cswitch(dft_radices, radices[0], [&](auto rfirst) { dft_permute3(out, in, 1, 1, rfirst);
-        //            },
-        //                    [&]() { dft_permute3(out, in, 1, 1, radices[0]); });
-        //        }
-
-        //        const size_t rlast = radices[count - 1];
-        //        const size_t size  = this->size / rlast;
-        //
-        //        cswitch(dft_radices, radices[0], [&](auto rfirst) {
-        //            for (size_t p = 0; p < rlast; p++)
-        //            {
-        //                reorder_impl(out, in, index, 1, rfirst);
-        //                in += size;
-        //            }
-        //        });
+        cswitch(dft_radices, radices[0],
+                [&](auto first_radix) {
+                    if (count == 3)
+                    {
+                        dft_permute(out, in, radices[2], radices[1], first_radix);
+                    }
+                    else
+                    {
+                        const size_t rlast = radices[count - 1];
+                        for (size_t p = 0; p < rlast; p++)
+                        {
+                            dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, first_radix);
+                            in += size / rlast;
+                        }
+                    }
+                },
+                [&]() {
+                    if (count == 3)
+                    {
+                        dft_permute(out, in, radices[2], radices[1], radices[0]);
+                    }
+                    else
+                    {
+                        const size_t rlast = radices[count - 1];
+                        for (size_t p = 0; p < rlast; p++)
+                        {
+                            dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, radices[0]);
+                            in += size / rlast;
+                        }
+                    }
+                });
     }
 };
 
@@ -732,6 +886,8 @@ struct fft_stage_impl : dft_stage<T>
 {
     fft_stage_impl(size_t stage_size)
     {
+        this->name       = type_name<decltype(*this)>();
+        this->radix      = 4;
         this->stage_size = stage_size;
         this->repeats    = 4;
         this->recursion  = true;
@@ -770,6 +926,8 @@ struct fft_final_stage_impl : dft_stage<T>
 {
     fft_final_stage_impl(size_t)
     {
+        this->name       = type_name<decltype(*this)>();
+        this->radix      = size;
         this->stage_size = size;
         this->out_offset = size;
         this->repeats    = 4;
@@ -863,6 +1021,7 @@ struct fft_reorder_stage_impl : dft_stage<T>
 {
     fft_reorder_stage_impl(size_t stage_size)
     {
+        this->name       = type_name<decltype(*this)>();
         this->stage_size = stage_size;
         log2n            = ilog2(stage_size);
         this->data_size  = 0;
@@ -887,7 +1046,7 @@ struct fft_specialization;
 template <typename T>
 struct fft_specialization<T, 1> : dft_stage<T>
 {
-    fft_specialization(size_t) {}
+    fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
 
 protected:
     constexpr static bool aligned = false;
@@ -905,7 +1064,7 @@ protected:
 template <typename T>
 struct fft_specialization<T, 2> : dft_stage<T>
 {
-    fft_specialization(size_t) {}
+    fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
 
 protected:
     constexpr static bool aligned = false;
@@ -923,7 +1082,7 @@ protected:
 template <typename T>
 struct fft_specialization<T, 3> : dft_stage<T>
 {
-    fft_specialization(size_t) {}
+    fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
 
 protected:
     constexpr static bool aligned = false;
@@ -940,7 +1099,7 @@ protected:
 template <typename T>
 struct fft_specialization<T, 4> : dft_stage<T>
 {
-    fft_specialization(size_t) {}
+    fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
 
 protected:
     constexpr static bool aligned = false;
@@ -957,7 +1116,7 @@ protected:
 template <typename T>
 struct fft_specialization<T, 5> : dft_stage<T>
 {
-    fft_specialization(size_t) {}
+    fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
 
 protected:
     constexpr static bool aligned = false;
@@ -974,7 +1133,7 @@ protected:
 template <typename T>
 struct fft_specialization<T, 6> : dft_stage<T>
 {
-    fft_specialization(size_t) {}
+    fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
 
 protected:
     constexpr static bool aligned = false;
@@ -991,6 +1150,7 @@ struct fft_specialization<T, 7> : dft_stage<T>
 {
     fft_specialization(size_t)
     {
+        this->name       = type_name<decltype(*this)>();
         this->stage_size = 128;
         this->data_size  = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
     }
@@ -1018,7 +1178,8 @@ protected:
     {
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
         final_pass<inverse>(csize_t<final_size>(), out, in, twiddle);
-        fft_reorder(out, csize_t<7>());
+        if (this->need_reorder)
+            fft_reorder(out, csize_t<7>());
     }
 
     template <bool inverse>
@@ -1045,7 +1206,11 @@ protected:
 template <>
 struct fft_specialization<float, 8> : dft_stage<float>
 {
-    fft_specialization(size_t) { this->temp_size = sizeof(complex<float>) * 256; }
+    fft_specialization(size_t)
+    {
+        this->name      = type_name<decltype(*this)>();
+        this->temp_size = sizeof(complex<float>) * 256;
+    }
 
 protected:
     using T = float;
@@ -1085,42 +1250,54 @@ template <>
 struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
 {
     using T = double;
-    using fft_final_stage_impl<double, false, 256>::fft_final_stage_impl;
+    fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
+    {
+        this->name = type_name<decltype(*this)>();
+    }
 
     DFT_STAGE_FN
     template <bool inverse>
     KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
-        fft_reorder(out, csize_t<8>());
+        if (this->need_reorder)
+            fft_reorder(out, csize_t<8>());
     }
 };
 
 template <typename T>
 struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
 {
-    using fft_final_stage_impl<T, false, 512>::fft_final_stage_impl;
+    fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
+    {
+        this->name = type_name<decltype(*this)>();
+    }
 
     DFT_STAGE_FN
     template <bool inverse>
     KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr);
-        fft_reorder(out, csize_t<9>());
+        if (this->need_reorder)
+            fft_reorder(out, csize_t<9>());
     }
 };
 
 template <typename T>
 struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
 {
-    using fft_final_stage_impl<T, false, 1024>::fft_final_stage_impl;
+    fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size)
+    {
+        this->name = type_name<decltype(*this)>();
+    }
 
     DFT_STAGE_FN
     template <bool inverse>
     KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr);
-        fft_reorder(out, 10, cfalse);
+        if (this->need_reorder)
+            fft_reorder(out, 10, cfalse);
     }
 };
 
@@ -1133,7 +1310,7 @@ template <typename Stage, typename... Args>
 void dft_plan<T>::add_stage(Args... args)
 {
     dft_stage<T>* stage = new Stage(args...);
-    stage->name         = nullptr;
+    stage->need_reorder = need_reorder;
     this->data_size += stage->data_size;
     this->temp_size += stage->temp_size;
     stages.push_back(dft_stage_ptr(stage));
@@ -1207,12 +1384,13 @@ void dft_plan<T>::initialize()
         offset += stage->data_size;
     }
 
-    bool to_scratch = false;
+    bool to_scratch     = false;
+    bool scratch_needed = false;
     for (dft_stage_ptr& stage : reversed(stages))
     {
         if (to_scratch)
         {
-            this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
+            scratch_needed = true;
         }
         stage->to_scratch = to_scratch;
         if (!stage->can_inplace)
@@ -1220,14 +1398,16 @@ void dft_plan<T>::initialize()
             to_scratch = !to_scratch;
         }
     }
+    if (scratch_needed || !stages[0]->can_inplace)
+        this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
 }
 
 template <typename T>
 const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in,
-                                         const complex<T>* scratch) const
+                                         const complex<T>* scratch, bool in_scratch) const
 {
     if (stage == 0)
-        return in;
+        return in_scratch ? scratch : in;
     return stages[stage - 1]->to_scratch ? scratch : out;
 }
 
@@ -1241,12 +1421,22 @@ template <typename T>
 template <bool inverse>
 void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
 {
+    if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
+    {
+        return stages[0]->execute(cbool<inverse>, out, in, temp);
+    }
     size_t stack[32] = { 0 };
 
     complex<T>* scratch =
         ptr_cast<complex<T>>(temp + this->temp_size -
                              align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
 
+    bool in_scratch = !stages[0]->can_inplace && in == out;
+    if (in_scratch)
+    {
+        internal::builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
+    }
+
     const size_t count = stages.size();
 
     for (size_t depth = 0; depth < count;)
@@ -1266,7 +1456,7 @@ void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T
                 else
                 {
                     complex<T>* rout      = select_out(rdepth, out, scratch);
-                    const complex<T>* rin = select_in(rdepth, out, in, scratch);
+                    const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
                     stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
                     offset += stages[rdepth]->out_offset;
                     stack[rdepth]++;
@@ -1281,15 +1471,16 @@ void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T
         else
         {
             stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch),
-                                   select_in(depth, out, in, scratch), temp);
+                                   select_in(depth, out, in, scratch, in_scratch), temp);
             depth++;
         }
     }
 }
 
 template <typename T>
-dft_plan<T>::dft_plan(size_t size, dft_type) : size(size), temp_size(0), data_size(0)
+dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0)
 {
+    need_reorder = true;
     if (is_poweroftwo(size))
     {
         const size_t log2n = ilog2(size);
@@ -1303,62 +1494,82 @@ dft_plan<T>::dft_plan(size_t size, dft_type) : size(size), temp_size(0), data_si
                     cswitch(cfalse_true, is_even(log2n), [&](auto is_even) {
                         this->make_fft(size, is_even, ctrue);
                         constexpr size_t is_evenv = val_of(decltype(is_even)());
-                        this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size);
+                        if (need_reorder)
+                            this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size);
                     });
                 });
     }
+#ifndef KFR_DFT_NO_NPo2
     else
     {
-        size_t cur_size                = size;
-        constexpr size_t radices_count = dft_radices.back() + 1;
-        u8 count[radices_count]        = { 0 };
-        int radices[32]                = { 0 };
-        size_t radices_size            = 0;
+        if (size == 60)
+        {
+            this->add_stage<internal::dft_special_stage_impl<T, 6, 10>>();
+        }
+        else if (size == 48)
+        {
+            this->add_stage<internal::dft_special_stage_impl<T, 6, 8>>();
+        }
+        else
+        {
+            size_t cur_size                = size;
+            constexpr size_t radices_count = dft_radices.back() + 1;
+            u8 count[radices_count]        = { 0 };
+            int radices[32]                = { 0 };
+            size_t radices_size            = 0;
+
+            cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) {
+                while (cur_size && cur_size % val_of(radix) == 0)
+                {
+                    count[val_of(radix)]++;
+                    cur_size /= val_of(radix);
+                }
+            });
 
-        cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) {
-            while (cur_size && cur_size % val_of(radix) == 0)
+            if (cur_size >= 101)
             {
-                count[val_of(radix)]++;
-                cur_size /= val_of(radix);
+                this->add_stage<internal::dft_arblen_stage_impl<T>>(size);
             }
-        });
+            else
+            {
+                size_t blocks     = 1;
+                size_t iterations = size;
 
-        size_t blocks     = 1;
-        size_t iterations = size;
+                for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++)
+                {
+                    for (size_t i = 0; i < count[r]; i++)
+                    {
+                        iterations /= r;
+                        radices[radices_size++] = r;
+                        if (iterations == 1)
+                            this->prepare_dft_stage(r, iterations, blocks, ctrue);
+                        else
+                            this->prepare_dft_stage(r, iterations, blocks, cfalse);
+                        blocks *= r;
+                    }
+                }
 
-        for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++)
-        {
-            for (size_t i = 0; i < count[r]; i++)
-            {
-                iterations /= r;
-                radices[radices_size++] = r;
-                if (iterations == 1)
-                    this->prepare_dft_stage(r, iterations, blocks, ctrue);
-                else
-                    this->prepare_dft_stage(r, iterations, blocks, cfalse);
-                blocks *= r;
-            }
-        }
+                if (cur_size > 1)
+                {
+                    iterations /= cur_size;
+                    radices[radices_size++] = cur_size;
+                    if (iterations == 1)
+                        this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
+                    else
+                        this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+                }
 
-        if (cur_size > 1)
-        {
-            iterations /= cur_size;
-            radices[radices_size++] = cur_size;
-            if (iterations == 1)
-                this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
-            else
-                this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+                if (stages.size() > 2)
+                    this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size);
+            }
         }
-
-        if (stages.size() > 2)
-            this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size);
     }
+#endif
     initialize();
 }
 
 template <typename T>
-dft_plan_real<T>::dft_plan_real(size_t size, dft_type type)
-    : dft_plan<T>(size / 2, type), size(size), rtwiddle(size / 4)
+dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4)
 {
     using namespace internal;
 
@@ -1463,6 +1674,15 @@ dft_plan<T>::~dft_plan()
 {
 }
 
+template <typename T>
+void dft_plan<T>::dump() const
+{
+    for (const dft_stage_ptr& s : stages)
+    {
+        s->dump();
+    }
+}
+
 namespace internal
 {
 
@@ -1621,24 +1841,26 @@ template void convolve_filter<double>::set_data(const univector<double>&);
 template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size);
 template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size);
 
-template dft_plan<float>::dft_plan(size_t, dft_type);
+template dft_plan<float>::dft_plan(size_t, dft_order);
 template dft_plan<float>::~dft_plan();
+template void dft_plan<float>::dump() const;
 template void dft_plan<float>::execute_dft(cometa::cbool_t<false>, kfr::complex<float>* out,
                                            const kfr::complex<float>* in, kfr::u8* temp) const;
 template void dft_plan<float>::execute_dft(cometa::cbool_t<true>, kfr::complex<float>* out,
                                            const kfr::complex<float>* in, kfr::u8* temp) const;
-template dft_plan_real<float>::dft_plan_real(size_t, dft_type);
+template dft_plan_real<float>::dft_plan_real(size_t);
 template void dft_plan_real<float>::from_fmt(kfr::complex<float>* out, const kfr::complex<float>* in,
                                              kfr::dft_pack_format fmt) const;
 template void dft_plan_real<float>::to_fmt(kfr::complex<float>* out, kfr::dft_pack_format fmt) const;
 
-template dft_plan<double>::dft_plan(size_t, dft_type);
+template dft_plan<double>::dft_plan(size_t, dft_order);
 template dft_plan<double>::~dft_plan();
+template void dft_plan<double>::dump() const;
 template void dft_plan<double>::execute_dft(cometa::cbool_t<false>, kfr::complex<double>* out,
                                             const kfr::complex<double>* in, kfr::u8* temp) const;
 template void dft_plan<double>::execute_dft(cometa::cbool_t<true>, kfr::complex<double>* out,
                                             const kfr::complex<double>* in, kfr::u8* temp) const;
-template dft_plan_real<double>::dft_plan_real(size_t, dft_type);
+template dft_plan_real<double>::dft_plan_real(size_t);
 template void dft_plan_real<double>::from_fmt(kfr::complex<double>* out, const kfr::complex<double>* in,
                                               kfr::dft_pack_format fmt) const;
 template void dft_plan_real<double>::to_fmt(kfr::complex<double>* out, kfr::dft_pack_format fmt) const;
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -54,6 +54,12 @@ enum class dft_type
     inverse
 };
 
+enum class dft_order
+{
+    normal,
+    internal, // possibly bit/digit-reversed, implementation-defined, faster
+};
+
 template <typename T>
 struct dft_stage;
 
@@ -65,7 +71,9 @@ struct dft_plan
     size_t size;
     size_t temp_size;
 
-    dft_plan(size_t size, dft_type = dft_type::both);
+    dft_plan(size_t size, dft_order order = dft_order::normal);
+
+    void dump() const;
 
     KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const
     {
@@ -101,6 +109,7 @@ protected:
     autofree<u8> data;
     size_t data_size;
     std::vector<dft_stage_ptr> stages;
+    bool need_reorder;
 
     template <typename Stage, typename... Args>
     void add_stage(Args... args);
@@ -115,7 +124,8 @@ protected:
     template <bool inverse>
     void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const;
 
-    const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, const complex<T>* scratch) const;
+    const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
+                                const complex<T>* scratch, bool in_scratch) const;
     complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const;
 };
 
@@ -129,7 +139,7 @@ template <typename T>
 struct dft_plan_real : dft_plan<T>
 {
     size_t size;
-    dft_plan_real(size_t size, dft_type = dft_type::both);
+    dft_plan_real(size_t size);
 
     void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete;
 
diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp
@@ -542,8 +542,10 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
 template <size_t N, typename T>
 KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1)
 {
-    w0 = a0 + a1;
-    w1 = a0 - a1;
+    const cvec<T, N> sum = a0 + a1;
+    const cvec<T, N> dif = a0 - a1;
+    w0                   = sum;
+    w1                   = dif;
 }
 
 template <size_t N, typename T>
@@ -1016,21 +1018,25 @@ KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1)
     a1 = cmul(a1, tw1);
 }
 
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw3r1 = static_cast<T>(-0.5 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw3i1 =
+    static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
+
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly3(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
-                           cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02)
+KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01,
+                           cvec<T, N>& w02)
 {
-    const static cvec<T, N> tw3r1 = static_cast<T>(-0.5 - 1.0);
-    const static cvec<T, N> tw3i1 =
-        static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
 
     const cvec<T, N> sum1 = a01 + a02;
     const cvec<T, N> dif1 = swap<2>(a01 - a02);
     w00                   = a00 + sum1;
 
-    const cvec<T, N> s1 = w00 + sum1 * tw3r1;
+    const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>;
 
-    const cvec<T, N> d1 = dif1 * tw3i1;
+    const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>;
 
     w01 = s1 + d1;
     w02 = s1 - d1;
@@ -1073,22 +1079,80 @@ KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<
     butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5);
 }
 
+template <typename T, bool inverse = false>
+const static cvec<T, 1> tw9_1 = { T(0.76604444311897803520239265055541),
+                                  (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) };
+template <typename T, bool inverse = false>
+const static cvec<T, 1> tw9_2 = { T(0.17364817766693034885171662676931),
+                                  (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) };
+template <typename T, bool inverse = false>
+const static cvec<T, 1> tw9_4 = { T(-0.93969262078590838405410927732473),
+                                  (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) };
+
 template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
-                           const cvec<T, N>& a03, const cvec<T, N>& a04, const cvec<T, N>& a05,
-                           const cvec<T, N>& a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
-                           cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06)
+KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+                           const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+                           const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, cvec<T, N>& w0,
+                           cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5,
+                           cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8)
+{
+    cvec<T, N* 3> a012 = concat(a0, a1, a2);
+    cvec<T, N* 3> a345 = concat(a3, a4, a5);
+    cvec<T, N* 3> a678 = concat(a6, a7, a8);
+    butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678);
+    cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8;
+    split(a012, t0, t1, t2);
+    split(a345, t3, t4, t5);
+    split(a678, t6, t7, t8);
+
+    t4 = cmul(t4, tw9_1<T, inverse>);
+    t5 = cmul(t5, tw9_2<T, inverse>);
+    t7 = cmul(t7, tw9_2<T, inverse>);
+    t8 = cmul(t8, tw9_4<T, inverse>);
+
+    cvec<T, N* 3> t036 = concat(t0, t3, t6);
+    cvec<T, N* 3> t147 = concat(t1, t4, t7);
+    cvec<T, N* 3> t258 = concat(t2, t5, t8);
+
+    butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258);
+    split(t036, w0, w1, w2);
+    split(t147, w3, w4, w5);
+    split(t258, w6, w7, w8);
+}
+
+template <size_t N, bool inverse = false, typename T>
+KFR_INTRIN void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
+                           cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8)
 {
-    const static cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884 - 1.0);
-    const static cvec<T, N> tw7i1 =
-        static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>();
-    const static cvec<T, N> tw7r2 = static_cast<T>(-0.2225209339563144042889025645 - 1.0);
-    const static cvec<T, N> tw7i2 =
-        static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>();
-    const static cvec<T, N> tw7r3 = static_cast<T>(-0.90096886790241912623610231951 - 1.0);
-    const static cvec<T, N> tw7i3 =
-        static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
+    butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8);
+}
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7i1 =
+    static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7r2 = static_cast<T>(-0.2225209339563144042889025645 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7i2 =
+    static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7r3 = static_cast<T>(-0.90096886790241912623610231951 - 1.0);
 
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7i3 =
+    static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
+
+template <size_t N, bool inverse = false, typename T>
+KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
+                           cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
+                           cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06)
+{
     const cvec<T, N> sum1 = a01 + a06;
     const cvec<T, N> dif1 = swap<2>(a01 - a06);
     const cvec<T, N> sum2 = a02 + a05;
@@ -1097,13 +1161,19 @@ KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const c
     const cvec<T, N> dif3 = swap<2>(a03 - a04);
     w00                   = a00 + sum1 + sum2 + sum3;
 
-    const cvec<T, N> s1 = w00 + sum1 * tw7r1 + sum2 * tw7r2 + sum3 * tw7r3;
-    const cvec<T, N> s2 = w00 + sum1 * tw7r2 + sum2 * tw7r3 + sum3 * tw7r1;
-    const cvec<T, N> s3 = w00 + sum1 * tw7r3 + sum2 * tw7r1 + sum3 * tw7r2;
+    const cvec<T, N> s1 =
+        w00 + sum1 * tw7r1<T, N, inverse> + sum2 * tw7r2<T, N, inverse> + sum3 * tw7r3<T, N, inverse>;
+    const cvec<T, N> s2 =
+        w00 + sum1 * tw7r2<T, N, inverse> + sum2 * tw7r3<T, N, inverse> + sum3 * tw7r1<T, N, inverse>;
+    const cvec<T, N> s3 =
+        w00 + sum1 * tw7r3<T, N, inverse> + sum2 * tw7r1<T, N, inverse> + sum3 * tw7r2<T, N, inverse>;
 
-    const cvec<T, N> d1 = dif1 * tw7i1 + dif2 * tw7i2 + dif3 * tw7i3;
-    const cvec<T, N> d2 = dif1 * tw7i2 - dif2 * tw7i3 - dif3 * tw7i1;
-    const cvec<T, N> d3 = dif1 * tw7i3 - dif2 * tw7i1 + dif3 * tw7i2;
+    const cvec<T, N> d1 =
+        dif1 * tw7i1<T, N, inverse> + dif2 * tw7i2<T, N, inverse> + dif3 * tw7i3<T, N, inverse>;
+    const cvec<T, N> d2 =
+        dif1 * tw7i2<T, N, inverse> - dif2 * tw7i3<T, N, inverse> - dif3 * tw7i1<T, N, inverse>;
+    const cvec<T, N> d3 =
+        dif1 * tw7i3<T, N, inverse> - dif2 * tw7i1<T, N, inverse> + dif3 * tw7i2<T, N, inverse>;
 
     w01 = s1 + d1;
     w06 = s1 - d1;
@@ -1120,29 +1190,131 @@ KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<
     butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6);
 }
 
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i1 =
+    static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i2 =
+    static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r3 = static_cast<T>(-0.14231483827328514044379266862 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i3 =
+    static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i4 =
+    static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i5 =
+    static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>();
+
+template <size_t N, bool inverse = false, typename T>
+KFR_INTRIN void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
+                            cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09,
+                            cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
+                            cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06,
+                            cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10)
+{
+    const cvec<T, N> sum1 = a01 + a10;
+    const cvec<T, N> dif1 = swap<2>(a01 - a10);
+    const cvec<T, N> sum2 = a02 + a09;
+    const cvec<T, N> dif2 = swap<2>(a02 - a09);
+    const cvec<T, N> sum3 = a03 + a08;
+    const cvec<T, N> dif3 = swap<2>(a03 - a08);
+    const cvec<T, N> sum4 = a04 + a07;
+    const cvec<T, N> dif4 = swap<2>(a04 - a07);
+    const cvec<T, N> sum5 = a05 + a06;
+    const cvec<T, N> dif5 = swap<2>(a05 - a06);
+    w00                   = a00 + sum1 + sum2 + sum3 + sum4 + sum5;
+
+    const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> +
+                          sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> +
+                          sum5 * tw11r5<T, N, inverse>;
+    const cvec<T, N> s2 = w00 + sum1 * tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> +
+                          sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> +
+                          sum5 * tw11r1<T, N, inverse>;
+    const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> +
+                          sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> +
+                          sum5 * tw11r2<T, N, inverse>;
+    const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> +
+                          sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> +
+                          sum5 * tw11r3<T, N, inverse>;
+    const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> +
+                          sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> +
+                          sum5 * tw11r4<T, N, inverse>;
+
+    const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> +
+                          dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> +
+                          dif5 * tw11i5<T, N, inverse>;
+    const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> -
+                          dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> -
+                          dif5 * tw11i1<T, N, inverse>;
+    const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> +
+                          dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> +
+                          dif5 * tw11i2<T, N, inverse>;
+    const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> +
+                          dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> -
+                          dif5 * tw11i3<T, N, inverse>;
+    const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> +
+                          dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> +
+                          dif5 * tw11i4<T, N, inverse>;
+
+    w01 = s1 + d1;
+    w10 = s1 - d1;
+    w02 = s2 + d2;
+    w09 = s2 - d2;
+    w03 = s3 + d3;
+    w08 = s3 - d3;
+    w04 = s4 + d4;
+    w07 = s4 - d4;
+    w05 = s5 + d5;
+    w06 = s5 - d5;
+}
+
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718 - 1.0);
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5i1 =
+    static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>();
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5r2 = static_cast<T>(-0.80901699437494742410229341718 - 1.0);
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5i2 =
+    static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
+
 template <size_t N, bool inverse = false, typename T>
 KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
                            const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
                            cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
 {
-    const static cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718 - 1.0);
-    const static cvec<T, N> tw5i1 =
-        static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>();
-    const static cvec<T, N> tw5r2 = static_cast<T>(-0.80901699437494742410229341718 - 1.0);
-    const static cvec<T, N> tw5i2 =
-        static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
-
     const cvec<T, N> sum1 = a01 + a04;
     const cvec<T, N> dif1 = swap<2>(a01 - a04);
     const cvec<T, N> sum2 = a02 + a03;
     const cvec<T, N> dif2 = swap<2>(a02 - a03);
     w00                   = a00 + sum1 + sum2;
 
-    const cvec<T, N> s1 = w00 + sum1 * tw5r1 + sum2 * tw5r2;
-    const cvec<T, N> s2 = w00 + sum1 * tw5r2 + sum2 * tw5r1;
+    const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse> + sum2 * tw5r2<T, N, inverse>;
+    const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse> + sum2 * tw5r1<T, N, inverse>;
 
-    const cvec<T, N> d1 = dif1 * tw5i1 + dif2 * tw5i2;
-    const cvec<T, N> d2 = dif1 * tw5i2 - dif2 * tw5i1;
+    const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse> + dif2 * tw5i2<T, N, inverse>;
+    const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse> - dif2 * tw5i1<T, N, inverse>;
 
     w01 = s1 + d1;
     w04 = s1 - d1;
@@ -1245,6 +1417,16 @@ KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N
 template <bool inverse, typename T, size_t N>
 KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
                           const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+                          const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, vec<T, N>& out0,
+                          vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
+                          vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8)
+{
+    butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4,
+                               out5, out6, out7, out8);
+}
+template <bool inverse, typename T, size_t N>
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
                           const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
                           const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
                           vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
@@ -1253,6 +1435,17 @@ KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N
     butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3,
                                 out4, out5, out6, out7, out8, out9);
 }
+template <bool inverse, typename T, size_t N>
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+                          const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+                          const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
+                          const vec<T, N>& in9, const vec<T, N>& in10, vec<T, N>& out0, vec<T, N>& out1,
+                          vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
+                          vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10)
+{
+    butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2,
+                                out3, out4, out5, out6, out7, out8, out9, out10);
+}
 template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
 KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
 {
@@ -1360,15 +1553,17 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args)
     butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...);
 }
 
-template <typename T, bool inverse, typename Tstride>
-KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, size_t, cbool_t<inverse>, complex<T>*, const complex<T>*,
-                                        Tstride, size_t, size_t, const complex<T>*, size_t)
+template <typename T, bool inverse, typename Tradix, typename Tstride>
+KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, Tradix radix, cbool_t<inverse>, complex<T>*,
+                                        const complex<T>*, Tstride, size_t, size_t, const complex<T>*, size_t)
 {
 }
-template <size_t width, bool inverse, typename T, typename Tstride>
-KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<inverse>, complex<T>* out,
-                                        const complex<T>* in, Tstride ostride, size_t halfradix,
-                                        size_t halfradix_sqr, const complex<T>* twiddle, size_t i)
+
+template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix,
+          typename Thalfradixsqr, typename Tstride>
+KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out,
+                                        const complex<T>* in, Tstride ostride, Thalfradix halfradix,
+                                        Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i)
 {
     CMT_LOOP_NOUNROLL
     for (; i < halfradix / width * width; i += width)
@@ -1377,7 +1572,6 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<in
         cvec<T, width> sum0  = resize<2 * width>(in0);
         cvec<T, width> sum1  = sum0;
 
-        CMT_LOOP_NOUNROLL
         for (size_t j = 0; j < halfradix; j++)
         {
             const cvec<T, 1> ina = cread<1>(in + (1 + j));
@@ -1442,6 +1636,36 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* 
         }
         cwrite<1>(out, hcadd(sum) + sums);
     }
+    const auto halfradix     = radix / 2;
+    const auto halfradix_sqr = halfradix * halfradix;
+    CMT_ASSUME(halfradix > 0);
+    size_t i = 0;
+
+    generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
+                            halfradix * halfradix, twiddle, i);
+}
+
+template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>>
+KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out,
+                                         const complex<T>* in, const complex<T>* twiddle,
+                                         Tstride ostride = Tstride{})
+{
+    {
+        cvec<T, width> sum = T();
+        size_t j           = 0;
+        CMT_LOOP_UNROLL
+        for (; j < radix / width * width; j += width)
+        {
+            sum += cread<width>(in + j);
+        }
+        cvec<T, 1> sums = T();
+        CMT_LOOP_UNROLL
+        for (; j < radix; j++)
+        {
+            sums += cread<1>(in + j);
+        }
+        cwrite<1>(out, hcadd(sum) + sums);
+    }
     const size_t halfradix     = radix / 2;
     const size_t halfradix_sqr = halfradix * halfradix;
     CMT_ASSUME(halfradix > 0);
@@ -1455,21 +1679,15 @@ template <typename T, bool inverse, typename Tstride = csize_t<1>>
 KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
                                   complex<T>* temp, const complex<T>* twiddle, Tstride ostride = Tstride{})
 {
-    if (out == in)
-    {
-        builtin_memcpy(temp, in, sizeof(complex<T>) * radix);
-        in = temp;
-    }
     constexpr size_t width = platform<T>::vector_width;
 
-    generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
-    /*cswitch(csizes_t<11>(), radix,
+    cswitch(csizes_t<11, 13>(), radix,
             [&](auto radix_) CMT_INLINE_LAMBDA {
-                generic_butterfly_w<width>(decltype(radix_)(), cbool_t<inverse>(), out, in, twiddle, ostride);
+                spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride);
             },
             [&]() CMT_INLINE_LAMBDA {
                 generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
-            });*/
+            });
 }
 
 template <typename T, size_t N>
diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp
@@ -191,13 +191,23 @@ struct test_case
         check(result, as_string(comparison.left), expr);
     }
 
+    void append_comment(const std::string& text)
+    {
+        comment += text;
+        if (show_progress)
+        {
+            println();
+            println(text, ":");
+        }
+    }
+
     void set_comment(const std::string& text)
     {
         comment = text;
         if (show_progress)
         {
             println();
-            println(comment, ":");
+            println(text, ":");
         }
     }
 
@@ -344,6 +354,6 @@ void assert_is_same_decay()
 #define TEST TESTO_TEST
 #define DTEST TESTO_DTEST
 #endif
-}
+} // namespace testo
 
 CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -10,6 +10,7 @@
 #include <kfr/dft.hpp>
 #include <kfr/dsp.hpp>
 #include <kfr/io.hpp>
+#include <set>
 
 using namespace kfr;
 
@@ -70,56 +71,71 @@ TEST(fft_accuracy)
 {
     testo::active_test()->show_progress = true;
     random_bit_generator gen(2247448713, 915890490, 864203735, 2982561);
-
-    testo::matrix(named("type")       = dft_float_types, //
-                  named("inverse")    = std::make_tuple(false, true), //
-                  named("log2(size)") = make_range(size_t(1), stopsize), //
-                  [&gen](auto type, bool inverse, size_t log2size) {
-                      using float_type  = type_of<decltype(type)>;
-                      const size_t size = 1 << log2size;
-
-                      {
-                          univector<complex<float_type>> in =
-                              truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
-                          univector<complex<float_type>> out    = in;
-                          univector<complex<float_type>> refout = out;
-                          const dft_plan<float_type> dft(size);
-                          univector<u8> temp(dft.temp_size);
-
-                          reference_dft(refout.data(), in.data(), size, inverse);
-                          dft.execute(out, out, temp, inverse);
-
-                          const float_type rms_diff = rms(cabs(refout - out));
-                          const double ops          = log2size * 100;
-                          const double epsilon      = std::numeric_limits<float_type>::epsilon();
-                          CHECK(rms_diff < epsilon * ops);
-                      }
-
-                      if (size >= 16)
-                      {
-                          univector<float_type> in =
-                              truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
-
-                          univector<complex<float_type>> out    = truncate(scalar(qnan), size);
-                          univector<complex<float_type>> refout = truncate(scalar(qnan), size);
-                          const dft_plan_real<float_type> dft(size);
-                          univector<u8> temp(dft.temp_size);
-
-                          reference_fft(refout.data(), in.data(), size);
-                          dft.execute(out, in, temp);
-                          const float_type rms_diff_r =
-                              rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1)));
-                          const double ops     = log2size * 200;
-                          const double epsilon = std::numeric_limits<float_type>::epsilon();
-                          CHECK(rms_diff_r < epsilon * ops);
-
-                          univector<float_type> out2(size, 0.f);
-                          dft.execute(out2, out, temp);
-                          out2                         = out2 / size;
-                          const float_type rms_diff_r2 = rms(in - out2);
-                          CHECK(rms_diff_r2 < epsilon * ops);
-                      }
-                  });
+    std::set<size_t> size_set;
+    univector<size_t> sizes = truncate(1 + counter(), stopsize - 1);
+    sizes                   = round(pow(2.0, sizes));
+
+#ifndef KFR_DFT_NO_NPo2
+    univector<size_t> sizes2 = truncate(2 + counter(), 1024);
+    for (size_t s : sizes2)
+    {
+        if (std::find(sizes.begin(), sizes.end(), s) == sizes.end())
+            sizes.push_back(s);
+    }
+#endif
+    println(sizes);
+
+    testo::matrix(
+        named("type") = dft_float_types, //
+        named("size") = sizes, //
+        [&gen](auto type, size_t size) {
+            using float_type      = type_of<decltype(type)>;
+            const double min_prec = 0.000001 * std::log(size) * size;
+
+            for (bool inverse : { false, true })
+            {
+                testo::active_test()->append_comment(inverse ? "complex-inverse" : "complex-direct");
+                univector<complex<float_type>> in =
+                    truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
+                univector<complex<float_type>> out    = in;
+                univector<complex<float_type>> refout = out;
+                univector<complex<float_type>> outo   = in;
+                const dft_plan<float_type> dft(size);
+                univector<u8> temp(dft.temp_size);
+
+                reference_dft(refout.data(), in.data(), size, inverse);
+                dft.execute(outo, in, temp, inverse);
+                dft.execute(out, out, temp, inverse);
+
+                const float_type rms_diff_inplace = rms(cabs(refout - out));
+                CHECK(rms_diff_inplace < min_prec);
+                const float_type rms_diff_outofplace = rms(cabs(refout - outo));
+                CHECK(rms_diff_outofplace < min_prec);
+            }
+
+            if (size >= 4 && is_poweroftwo(size))
+            {
+                univector<float_type> in = truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
+
+                univector<complex<float_type>> out    = truncate(scalar(qnan), size);
+                univector<complex<float_type>> refout = truncate(scalar(qnan), size);
+                const dft_plan_real<float_type> dft(size);
+                univector<u8> temp(dft.temp_size);
+
+                testo::active_test()->append_comment("real-direct");
+                reference_fft(refout.data(), in.data(), size);
+                dft.execute(out, in, temp);
+                float_type rms_diff = rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1)));
+                CHECK(rms_diff < min_prec);
+
+                univector<float_type> out2(size, 0.f);
+                testo::active_test()->append_comment("real-inverse");
+                dft.execute(out2, out, temp);
+                out2     = out2 / size;
+                rms_diff = rms(in - out2);
+                CHECK(rms_diff < min_prec);
+            }
+        });
 }
 
 #ifndef KFR_NO_MAIN
@@ -127,6 +143,6 @@ int main()
 {
     println(library_version(), " running on ", cpu_runtime());
 
-    return testo::run_all("", true);
+    return testo::run_all("", false);
 }
 #endif

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log \| Files \| Refs \| README

M	README.md	\|	2	+-
M	include/kfr/dft/dft-src.cpp	\|	490	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M	include/kfr/dft/fft.hpp	\|	16	+++++++++++++---
M	include/kfr/dft/ft.hpp	\|	330	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	include/kfr/testo/testo.hpp	\|	14	++++++++++++--
M	tests/dft_test.cpp	\|	118	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------