kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit a5251827c16beec42b386b5e78d0663ae37639d8
parent 6feabe6fffbd44c16a643159e0dc87bcc8f2282f
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Fri, 10 Nov 2023 08:44:48 +0000

DFT performance improvements

Diffstat:
M CMakeLists.txt                            |    6 +++---
M include/kfr/dft/fft.hpp                   |  138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M include/kfr/dft/impl/bitrev.hpp           |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M include/kfr/dft/impl/dft-fft.hpp          |   15 ++++++++++-----
M include/kfr/dft/impl/dft-impl.hpp         |    6 +++++-
M include/kfr/dft/impl/fft-impl.hpp         | 1110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M include/kfr/dft/impl/ft.hpp               |   85 +++++++++++++++++++++++++++++++++++------------------------------------
M include/kfr/simd/digitreverse.hpp         |   36 ++++++++++++++++++++++--------------
M include/kfr/simd/impl/specializations.hpp |   63 +++++++++++++++++++++++++++++++--------------------------------
M tests/dft_test.cpp                        |   22 +++++++++++++++-------
10 files changed, 1250 insertions(+), 312 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -163,8 +163,8 @@ if (APPLE)
 endif ()
 if (NOT IOS)
     if (NOT MSVC OR CLANG)
-        target_compile_options(kfr
-            INTERFACE "${CLANG_ARG_PREFIX}-mstackrealign")
+        # target_compile_options(kfr
+        #     INTERFACE "${CLANG_ARG_PREFIX}-mstackrealign")
     endif ()
 endif ()
 if (MSVC)
@@ -221,7 +221,7 @@ endfunction ()
 
 if (KFR_ENABLE_DFT)
-    set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast")
+    set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast -Xclang -O3 -mllvm -x86-use-vzeroupper=0")
 
     if (KFR_ENABLE_DFT_MULTIARCH)
         add_library(kfr_dft INTERFACE)
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -34,6 +34,7 @@
 #include "../simd/constants.hpp"
 #include "../simd/read_write.hpp"
 #include "../simd/vec.hpp"
+#include <bitset>
 
 CMT_PRAGMA_GNU(GCC diagnostic push)
 #if CMT_HAS_WARNING("-Wshadow")
@@ -49,6 +50,8 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
 namespace kfr
 {
 
+#define DFT_MAX_STAGES 32
+
 using cdirect_t = cfalse_t;
 using cinvert_t = ctrue_t;
 
@@ -67,17 +70,14 @@ struct dft_stage
     const char* name  = nullptr;
     bool recursion    = false;
     bool can_inplace  = true;
-    bool inplace      = false;
-    bool to_scratch   = false;
     bool need_reorder = true;
 
     void initialize(size_t size) { do_initialize(size); }
 
     virtual void dump() const
     {
-        printf("%s: %zu, %zu, %zu, %zu, %zu, %zu, %zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
-               stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
-               to_scratch);
+        printf("%s: %zu, %zu, %zu, %zu, %zu, %zu, %zu, %d, %d\n", name ? name : "unnamed", radix, stage_size,
+               data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace);
     }
 
     KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
@@ -173,7 +173,7 @@ struct dft_plan
 
     void dump() const
     {
-        for (const std::unique_ptr<dft_stage<T>>& s : stages)
+        for (const std::unique_ptr<dft_stage<T>>& s : all_stages)
         {
             s->dump();
         }
@@ -213,8 +213,89 @@ struct dft_plan
     autofree<u8> data;
     size_t data_size;
-    std::vector<dft_stage_ptr<T>> stages;
+
+    std::vector<dft_stage_ptr<T>> all_stages;
+    std::array<std::vector<dft_stage<T>*>, 2> stages;
     bool arblen;
+    using bitset = std::bitset<DFT_MAX_STAGES>;
+    std::array<bitset, 2> disposition_inplace;
+    std::array<bitset, 2> disposition_outofplace;
+
+    void calc_disposition()
+    {
+        for (bool inverse : { false, true })
+        {
+            auto&& stages = this->stages[inverse];
+            bitset can_inplace_per_stage;
+            for (int i = 0; i < stages.size(); ++i)
+            {
+                can_inplace_per_stage[i] = stages[i]->can_inplace;
+            }
+
+            disposition_inplace[static_cast<int>(inverse)] =
+                precompute_disposition(stages.size(), can_inplace_per_stage, true);
+            disposition_outofplace[static_cast<int>(inverse)] =
+                precompute_disposition(stages.size(), can_inplace_per_stage, false);
+        }
+    }
+
+    static bitset precompute_disposition(int num_stages, bitset can_inplace_per_stage,
+                                         bool inplace_requested)
+    {
+        static bitset even{ 0x5555555555555555ull };
+        bitset mask = ~bitset() >> (DFT_MAX_STAGES - num_stages);
+        bitset result;
+        // disposition indicates where is input for corresponding stage
+        // first bit : 0 - input,  1 - scratch
+        // other bits: 0 - output, 1 - scratch
+
+        // build disposition that works always
+        if (num_stages % 2 == 0)
+        { // even
+            result = ~even & mask;
+        }
+        else
+        { // odd
+            result = even & mask;
+        }
+
+        int num_inplace = can_inplace_per_stage.count();
+
+#ifdef KFR_DFT_ELIMINATE_MEMCPY
+        if (num_inplace > 0 && inplace_requested)
+        {
+            if (result.test(0)) // input is in scratch
+            {
+                // num_inplace must be odd
+                if (num_inplace % 2 == 0)
+                    --num_inplace;
+            }
+            else
+            {
+                // num_inplace must be even
+                if (num_inplace % 2 != 0)
+                    --num_inplace;
+            }
+        }
+#endif
+        if (num_inplace > 0)
+        {
+            for (int i = num_stages - 1; i >= 0; --i)
+            {
+                if (can_inplace_per_stage.test(i))
+                {
+                    result ^= ~bitset() >> (DFT_MAX_STAGES - (i + 1));
+
+                    if (--num_inplace == 0)
+                        break;
+                }
+            }
+        }
+
+        if (!inplace_requested) // out-of-place first stage; IN->OUT
+            result.reset(0);
+
+        return result;
+    }
 
 protected:
     struct noinit
@@ -224,32 +305,35 @@ protected:
         : size(size), temp_size(0), data_size(0), arblen(false)
     {
     }
-    const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
-                                const complex<T>* scratch, bool in_scratch) const
+    const complex<T>* select_in(bitset disposition, size_t stage, const complex<T>* out,
+                                const complex<T>* in, const complex<T>* scratch) const
     {
-        if (stage == 0)
-            return in_scratch ? scratch : in;
-        return stages[stage - 1]->to_scratch ? scratch : out;
+        return disposition.test(stage) ? scratch : stage == 0 ? in : out;
     }
-    complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
+    complex<T>* select_out(bitset disposition, size_t stage, size_t total_stages, complex<T>* out,
+                           complex<T>* scratch) const
     {
-        return stages[stage]->to_scratch ? scratch : out;
+        return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out;
     }
 
     template <bool inverse>
     void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
     {
+        auto&& stages = this->stages[inverse];
         if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
        {
             return stages[0]->execute(cbool<inverse>, out, in, temp);
         }
-        size_t stack[32] = { 0 };
+        size_t stack[DFT_MAX_STAGES] = { 0 };
+
+        bitset disposition =
+            in == out ? this->disposition_inplace[inverse] : this->disposition_outofplace[inverse];
 
         complex<T>* scratch = ptr_cast<complex<T>>(
             temp + this->temp_size -
             align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
 
-        bool in_scratch = !stages[0]->can_inplace && in == out;
+        bool in_scratch = disposition.test(0);
         if (in_scratch)
         {
             builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
@@ -273,8 +357,8 @@ protected:
                 }
                 else
                 {
-                    complex<T>* rout      = select_out(rdepth, out, scratch);
-                    const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
+                    complex<T>* rout      = select_out(disposition, rdepth, stages.size(), out, scratch);
+                    const complex<T>* rin = select_in(disposition, rdepth, out, in, scratch);
                     stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
                     offset += stages[rdepth]->out_offset;
                     stack[rdepth]++;
@@ -291,8 +375,9 @@ protected:
             size_t offset = 0;
             while (offset < this->size)
             {
-                stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch) + offset,
-                                       select_in(depth, out, in, scratch, in_scratch) + offset, temp);
+                stages[depth]->execute(
+                    cbool<inverse>, select_out(disposition, depth, stages.size(), out, scratch) + offset,
+                    select_in(disposition, depth, out, in, scratch) + offset, temp);
                 offset += stages[depth]->stage_size;
             }
             depth++;
@@ -306,7 +391,6 @@ struct dft_plan_real : dft_plan<T>
 {
     size_t size;
     dft_pack_format fmt;
-    dft_stage_ptr<T> fmt_stage;
 
     explicit dft_plan_real(cpu_t cpu, size_t size, dft_pack_format fmt = dft_pack_format::CCs)
         : dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt)
@@ -360,13 +444,10 @@ struct dft_plan_real : dft_plan<T>
     KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, cdirect_t = {}) const
     {
         this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp);
-        fmt_stage->execute(cfalse, out, out, nullptr);
     }
     KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, cinvert_t = {}) const
     {
-        complex<T>* outdata = ptr_cast<complex<T>>(out);
-        fmt_stage->execute(ctrue, outdata, in, nullptr);
-        this->execute_dft(ctrue, outdata, outdata, temp);
+        this->execute_dft(ctrue, ptr_cast<complex<T>>(out), in, temp);
     }
 
     template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
@@ -374,15 +455,12 @@ struct dft_plan_real : dft_plan<T>
                                    univector<u8, Tag3>& temp, cdirect_t = {}) const
     {
         this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data());
-        fmt_stage->execute(cfalse, out.data(), out.data(), nullptr);
     }
 
     template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
     KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in,
                                    univector<u8, Tag3>& temp, cinvert_t = {}) const
     {
-        complex<T>* outdata = ptr_cast<complex<T>>(out.data());
-        fmt_stage->execute(ctrue, outdata, in.data(), nullptr);
-        this->execute_dft(ctrue, outdata, outdata, temp.data());
+        this->execute_dft(ctrue, ptr_cast<complex<T>>(out.data()), in.data(), temp.data());
     }
 
     // Deprecated. fmt must be passed to constructor instead
@@ -434,7 +512,7 @@ struct dct_plan : dft_plan<T>
         }
         else
         {
-            mirrored = make_complex(make_univector(in, size) * cos(t), make_univector(in, size) * -sin(t));
+            mirrored = make_complex(make_univector(in, size) * cos(t), make_univector(in, size) * -sin(t));
             mirrored[0] = mirrored[0] * T(0.5);
             dft_plan<T>::execute(mirrored_dft.data(), mirrored.data(), temp, cfalse);
             for (size_t i = 0; i < halfSize; i++)
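Note on the fft.hpp change above: the per-stage inplace/to_scratch flags are replaced by two precomputed "disposition" bitsets per direction. Bit i says where stage i reads from (bit 0: user input vs scratch; higher bits: user output vs scratch), and a simple parity rule guarantees the last stage always writes to the output buffer. The following is a minimal standalone sketch of that rule — invented names, not KFR's API:

    #include <bitset>
    #include <cstdio>

    constexpr int max_stages = 32;
    using stage_bits = std::bitset<max_stages>;

    // Alternate buffers every stage so that stage num_stages-1 writes `out`.
    // Mirrors the core of precompute_disposition (without the can_inplace
    // optimization that XORs a suffix of bits to drop the initial memcpy).
    stage_bits alternating_disposition(int num_stages, bool inplace)
    {
        const stage_bits even{ 0x5555555555555555ull }; // bits 0, 2, 4, ... set
        const stage_bits mask = ~stage_bits() >> (max_stages - num_stages);
        stage_bits d = (num_stages % 2 == 0) ? ~even & mask : even & mask;
        if (!inplace)
            d.reset(0); // out-of-place: stage 0 can always read the input directly
        return d;
    }

    int main()
    {
        stage_bits d = alternating_disposition(3, /*inplace=*/true);
        for (int i = 0; i < 3; ++i)
            std::printf("stage %d reads from %s\n", i,
                        d.test(i) ? "scratch" : (i == 0 ? "input" : "output"));
    }

Each stage's destination then falls out of the next stage's bit, exactly as select_out above derives it, so no per-execution bookkeeping is needed.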
diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp
@@ -58,7 +58,12 @@ CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x)
 CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
 {
     if (bits > bitrev_table_log2N)
-        return bitreverse<32>(x) >> (32 - bits);
+    {
+        if (bits <= 16)
+            return bitreverse<16>(x) >> (16 - bits);
+        else
+            return bitreverse<32>(x) >> (32 - bits);
+    }
 
     return data::bitrev_table[x] >> (bitrev_table_log2N - bits);
 }
@@ -66,7 +71,12 @@ CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
 CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
 {
     if (bits > bitrev_table_log2N)
-        return digitreverse4<32>(x) >> (32 - bits);
+    {
+        if (bits <= 16)
+            return digitreverse4<16>(x) >> (16 - bits);
+        else
+            return digitreverse4<32>(x) >> (32 - bits);
+    }
 
     x = data::bitrev_table[x];
     x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
@@ -218,7 +228,7 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>)
 }
 
 template <typename T>
-KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */)
 {
     constexpr size_t bitrev = 4;
     fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4);
@@ -232,6 +242,20 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
 }
 
 template <typename T>
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */)
+{
+    constexpr size_t bitrev = 2;
+    fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4);
+    fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4);
+    fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4);
+}
+
+template <typename T>
 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
 {
     constexpr size_t bitrev = 2;
@@ -253,6 +277,44 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
     fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4);
 }
 
+template <typename T>
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */)
+{
+    constexpr size_t bitrev = 2;
+    fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 7 * 4, 56 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4);
+    fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4);
+    fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4);
+    fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4);
+    fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4);
+}
+
 template <typename T, bool use_br2>
 KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
 {
@@ -285,22 +347,35 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
     {
         size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
+        else
+        {
+            i += 4 * istep;
+            continue;
+        }
 
         i += istep;
         j = j + jstep1;
 
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
         i += istep;
         j = j - jstep2;
 
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
         i += istep;
         j = j + jstep1;
 
         if (i >= j)
+        {
             fft_reorder_swap_n4(io, i, j, N4, use_br2);
+        }
         i += istep;
     }
 }
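The bitrev.hpp change above adds a cheaper fallback for indices that fit in 16 bits: reverse only the low 16 bits instead of all 32. A self-contained sketch of that shortcut (illustrative; KFR additionally consults a lookup table for indices up to bitrev_table_log2N bits):

    #include <cstdint>

    constexpr std::uint32_t bitreverse32(std::uint32_t x)
    {
        x = (x >> 16) | (x << 16);
        x = ((x & 0xff00ff00u) >> 8) | ((x & 0x00ff00ffu) << 8);
        x = ((x & 0xf0f0f0f0u) >> 4) | ((x & 0x0f0f0f0fu) << 4);
        x = ((x & 0xccccccccu) >> 2) | ((x & 0x33333333u) << 2);
        x = ((x & 0xaaaaaaaau) >> 1) | ((x & 0x55555555u) << 1);
        return x;
    }

    constexpr std::uint32_t bitreverse16(std::uint32_t x)
    {
        // Reverses the low 16 bits only; one fewer shuffle round.
        x = ((x & 0xff00u) >> 8) | ((x & 0x00ffu) << 8);
        x = ((x & 0xf0f0u) >> 4) | ((x & 0x0f0fu) << 4);
        x = ((x & 0xccccu) >> 2) | ((x & 0x3333u) << 2);
        x = ((x & 0xaaaau) >> 1) | ((x & 0x5555u) << 1);
        return x;
    }

    // Reverse the lowest `bits` bits of x, as in the patched fallback path.
    constexpr std::uint32_t bitrev(std::uint32_t x, unsigned bits)
    {
        return bits <= 16 ? bitreverse16(x) >> (16 - bits)
                          : bitreverse32(x) >> (32 - bits);
    }

    static_assert(bitrev(0b001u, 3) == 0b100u, "3-bit reversal");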
diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp
@@ -94,14 +94,19 @@ CMT_PRAGMA_GNU(GCC diagnostic push)
 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
 #endif
 
-template <typename Stage, typename T, typename... Args>
-void add_stage(dft_plan<T>* self, Args... args)
+template <typename Stage, bool add_stages = true, typename T, typename... Args>
+void add_stage(dft_plan<T>* plan, Args... args)
 {
     dft_stage<T>* stage = new Stage(args...);
     stage->need_reorder = true;
-    self->data_size += stage->data_size;
-    self->temp_size += stage->temp_size;
-    self->stages.push_back(dft_stage_ptr<T>(stage));
+    plan->data_size += stage->data_size;
+    plan->temp_size += stage->temp_size;
+    plan->all_stages.push_back(dft_stage_ptr<T>(stage));
+    if constexpr (add_stages)
+    {
+        plan->stages[0].push_back(stage);
+        plan->stages[1].push_back(stage);
+    }
 }
 
 } // namespace CMT_ARCH_NAME
diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp
@@ -515,9 +515,11 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
         }
     });
 
+    int num_stages = 0;
     if (cur_size >= 101)
     {
         add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size);
+        ++num_stages;
         self->arblen = true;
     }
     else
@@ -535,6 +537,7 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
                     prepare_dft_stage(self, r, iterations, blocks, ctrue);
                 else
                     prepare_dft_stage(self, r, iterations, blocks, cfalse);
+                ++num_stages;
                 blocks *= r;
             }
         }
@@ -547,9 +550,10 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
                 prepare_dft_stage(self, cur_size, iterations, blocks, ctrue);
             else
                 prepare_dft_stage(self, cur_size, iterations, blocks, cfalse);
+            ++num_stages;
         }
 
-        if (self->stages.size() > 2)
+        if (num_stages > 2)
             add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size);
     }
 }
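add_stage above now records each stage twice: all_stages owns every stage (for cleanup and dump()), while stages[0]/stages[1] are non-owning per-direction execution lists, which is what lets the plan schedule different stage sequences for forward and inverse transforms. A minimal sketch of that ownership split (hypothetical simplified types; KFR uses dft_stage_ptr<T> and raw dft_stage<T>* views):

    #include <array>
    #include <memory>
    #include <vector>

    struct stage
    {
        virtual ~stage() = default;
    };

    struct plan
    {
        std::vector<std::unique_ptr<stage>> all_stages; // owns lifetime
        std::array<std::vector<stage*>, 2> stages;      // [0] = direct, [1] = inverse

        template <typename Stage, bool add_to_both = true, typename... Args>
        void add_stage(Args&&... args)
        {
            auto s     = std::make_unique<Stage>(std::forward<Args>(args)...);
            stage* raw = s.get();
            all_stages.push_back(std::move(s));
            if constexpr (add_to_both)
            {
                stages[0].push_back(raw);
                stages[1].push_back(raw);
            }
        }
    };

The num_stages counter in init_dft exists because self->stages.size() no longer counts only the butterfly stages once the reorder stage is added to both direction lists.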
diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -46,6 +46,57 @@ namespace kfr
 inline namespace CMT_ARCH_NAME
 {
 
+constexpr bool inline always_br2   = true;
+constexpr bool inline use_autosort = false;
+
+#define KFR_AUTOSORT_FOR_128D
+#define KFR_AUTOSORT_FOR_256D
+#define KFR_AUTOSORT_FOR_512
+#define KFR_AUTOSORT_FOR_1024
+
+#ifdef CMT_ARCH_AVX
+template <>
+KFR_INTRINSIC vec<float, 32> ctranspose<4, float, 32>(const vec<float, 32>& v16)
+{
+    cvec<float, 4> r0, r1, r2, r3;
+    split(v16, r0, r1, r2, r3);
+    const __m256d t0 = _mm256_unpacklo_pd(r0.v, r1.v);
+    const __m256d t1 = _mm256_unpacklo_pd(r2.v, r3.v);
+    const __m256d t2 = _mm256_unpackhi_pd(r0.v, r1.v);
+    const __m256d t3 = _mm256_unpackhi_pd(r2.v, r3.v);
+    r0.v = _mm256_permute2f128_pd(t0, t1, 0x20);
+    r1.v = _mm256_permute2f128_pd(t2, t3, 0x20);
+    r2.v = _mm256_permute2f128_pd(t0, t1, 0x31);
+    r3.v = _mm256_permute2f128_pd(t2, t3, 0x31);
+    return concat(r0, r1, r2, r3);
+}
+#endif
+
+template <>
+KFR_INTRINSIC vec<float, 64> ctranspose<8, float, 64>(const vec<float, 64>& v32)
+{
+    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
+    split(v32, a0, a1, a2, a3, a4, a5, a6, a7);
+    cvec<float, 16> even = concat(a0, a2, a4, a6);
+    cvec<float, 16> odd  = concat(a1, a3, a5, a7);
+    even = ctranspose<4>(even);
+    odd  = ctranspose<4>(odd);
+    return concat(even, odd);
+}
+
+template <>
+KFR_INTRINSIC vec<float, 64> ctranspose<4, float, 64>(const vec<float, 64>& v32)
+{
+    cvec<float, 16> lo, hi;
+    split(v32, lo, hi);
+    lo = ctranspose<4>(lo);
+    hi = ctranspose<4>(hi);
+    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
+    split(lo, a0, a1, a2, a3);
+    split(hi, a4, a5, a6, a7);
+    return concat(a0, a4, a1, a5, a2, a6, a3, a7);
+}
+
 namespace intrinsics
 {
 
@@ -59,8 +110,13 @@ KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*spl
     ww = swap<2>(ww);
 
     if constexpr (inverse)
-        tw_ = -(tw_);
-    ww = subadd(b1, ww * dupodd(tw_));
+    {
+        ww = addsub(b1, ww * dupodd(tw_));
+    }
+    else
+    {
+        ww = subadd(b1, ww * dupodd(tw_));
+    }
     return ww;
 }
 
@@ -134,18 +190,18 @@ KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splito
 {
     const size_t N4 = N / 4;
     cvec<T, width> w1, w2, w3;
-    constexpr bool read_split  = !splitin && splitout;
-    constexpr bool write_split = splitin && !splitout;
+    constexpr bool read_split  = !splitin;
+    constexpr bool write_split = !splitout;
 
     vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3;
 
     split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0);
-    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
     split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2);
-    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
-
     const vec<T, width> sum02re = re0 + re2;
     const vec<T, width> sum02im = im0 + im2;
+
+    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
+    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
     const vec<T, width> sum13re = re1 + re3;
     const vec<T, width> sum13im = im1 + im3;
 
@@ -254,19 +310,38 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s
 #endif
 #endif
 
-template <typename T>
+template <size_t size = 1, typename T>
 KFR_INTRINSIC void prefetch_one(const complex<T>* in)
 {
     KFR_PREFETCH(in);
+    if constexpr (sizeof(complex<T>) * size > 64)
+        KFR_PREFETCH(in + 64);
+    if constexpr (sizeof(complex<T>) * size > 128)
+        KFR_PREFETCH(in + 128);
+    if constexpr (sizeof(complex<T>) * size > 192)
+        KFR_PREFETCH(in + 192);
 }
 
-template <typename T>
+template <size_t size = 1, typename T>
 KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in)
 {
-    KFR_PREFETCH(in);
-    KFR_PREFETCH(in + stride);
-    KFR_PREFETCH(in + stride * 2);
-    KFR_PREFETCH(in + stride * 3);
+    prefetch_one<size>(in);
+    prefetch_one<size>(in + stride);
+    prefetch_one<size>(in + stride * 2);
+    prefetch_one<size>(in + stride * 3);
+}
+
+template <size_t size = 1, typename T>
+KFR_INTRINSIC void prefetch_eight(size_t stride, const complex<T>* in)
+{
+    prefetch_one<size>(in);
+    prefetch_one<size>(in + stride);
+    prefetch_one<size>(in + stride * 2);
+    prefetch_one<size>(in + stride * 3);
+    prefetch_one<size>(in + stride * 4);
+    prefetch_one<size>(in + stride * 5);
+    prefetch_one<size>(in + stride * 6);
+    prefetch_one<size>(in + stride * 7);
 }
 
 template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
@@ -277,7 +352,7 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
                                    const complex<T>*& twiddle)
 {
     static_assert(width > 0, "width cannot be zero");
-    constexpr static size_t prefetch_offset = width * 8;
+    constexpr static size_t prefetch_cycles = 8;
     const auto N4  = N / csize_t<4>();
     const auto N43 = N4 * csize_t<3>();
     CMT_ASSUME(blocks > 0);
@@ -286,16 +361,17 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
     DFT_ASSERT(width <= N4);
     CMT_LOOP_NOUNROLL
     for (size_t b = 0; b < blocks; b++)
     {
-        CMT_PRAGMA_CLANG(clang loop unroll_count(2))
-        for (size_t n2 = 0; n2 < N4; n2 += width)
+        CMT_LOOP_NOUNROLL
+        for (size_t n2 = 0; n2 < N4;)
         {
             if constexpr (prefetch)
-                prefetch_four(N4, in + prefetch_offset);
+                prefetch_four<width>(N4, in + width * prefetch_cycles);
             radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
                         cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
                         in, twiddle + n2 * 3);
             in += width;
             out += width;
+            n2 += width;
        }
         in += N43;
         out += N43;
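The prefetch helpers above are now parameterized by how many complex elements one loop iteration consumes, and call sites express the prefetch distance in loop iterations (prefetch_cycles) rather than raw elements. The pattern, as a hedged standalone sketch (assuming KFR_PREFETCH wraps a compiler builtin such as GCC/Clang's __builtin_prefetch):

    #include <complex>
    #include <cstddef>

    template <std::size_t width>
    void process_stream(std::complex<float>* out, const std::complex<float>* in, std::size_t n)
    {
        constexpr std::size_t prefetch_cycles = 8; // stay this many iterations ahead
        for (std::size_t i = 0; i < n; i += width)
        {
            // Touch the cache lines the loop will need `prefetch_cycles`
            // iterations from now; one prefetch per 64-byte line.
            const char* p = reinterpret_cast<const char*>(in + i + prefetch_cycles * width);
            for (std::size_t b = 0; b < width * sizeof(std::complex<float>); b += 64)
                __builtin_prefetch(p + b);

            for (std::size_t k = 0; k < width; ++k) // placeholder for the butterfly work
                out[i + k] = in[i + k];
        }
    }

Tying the distance to iterations keeps the prefetch lead time roughly constant as the per-iteration width changes between specializations.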
@@ -304,41 +380,314 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
     return {};
 }
 
-template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>,
-                                  cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
-                                  complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, bool inverse, typename T>
+KFR_INTRINSIC void radix2_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
+                                        cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>,
+                                        cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+                                        const complex<T>*& twiddle)
 {
-    CMT_ASSUME(blocks > 0);
-    constexpr static size_t prefetch_offset = 32 * 4;
-    for (size_t b = 0; b < blocks; b++)
+    for (size_t n = 0; n < stride; n++)
+    {
+        const cvec<T, 1> a = cread<1>(in + n);
+        const cvec<T, 1> b = cread<1>(in + n + stride);
+        cwrite<1>(out + n, a + b);
+        cwrite<1>(out + n + stride, a - b);
+    }
+}
+
+template <size_t N, bool inverse, bool split_format, typename T>
+KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
+                                    cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+{
+    if constexpr (split_format)
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = a1 - a3;
+        vec<T, N> diff13re, diff13im, diff02re, diff02im;
+        split(diff02, diff02re, diff02im);
+        split(diff13, diff13re, diff13im);
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+
+        (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+        (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+    }
+    else
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = swap<2>(a1 - a3);
+        if constexpr (inverse)
+            diff13 = negodd(diff13);
+        else
+            diff13 = negeven(diff13);
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+        w1 = diff02 - diff13;
+        w3 = diff02 + diff13;
+    }
+}
+
+template <size_t N, bool inverse, bool split_format, typename T>
+KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
+                                    cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3,
+                                    cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3)
+{
+    if constexpr (split_format)
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = a1 - a3;
+        vec<T, N> diff13re, diff13im, diff02re, diff02im;
+        split(diff02, diff02re, diff02im);
+        split(diff13, diff13re, diff13im);
+
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+        (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+        (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+
+        w2 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w2, tw2);
+        w1 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w1, tw1);
+        w3 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w3, tw3);
+    }
+    else
+    {
+        const cvec<T, N> sum02  = a0 + a2;
+        const cvec<T, N> diff02 = a0 - a2;
+        const cvec<T, N> sum13  = a1 + a3;
+        cvec<T, N> diff13       = swap<2>(a1 - a3);
+        if constexpr (inverse)
+            diff13 = negodd(diff13);
+        else
+            diff13 = negeven(diff13);
+
+        w0 = sum02 + sum13;
+        w2 = sum02 - sum13;
+        w1 = diff02 - diff13;
+        w3 = diff02 + diff13;
+        w2 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w2, tw2);
+        w1 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w1, tw1);
+        w3 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w3, tw3);
+    }
+}
+
+template <size_t N, bool split_format, typename T>
+KFR_INTRINSIC cvec<T, N> read_twiddle(const complex<T>* tw)
+{
+    if constexpr (split_format)
+    {
+        return concat(repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw))),
+                      repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw) + 1)));
+    }
+    else
+    {
+        return repeat<N>(cread<1>(tw));
+    }
+}
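radix4_butterfly above is the vectorized form of the standard radix-4 DIT step: two sums, two differences, and a multiplication of one difference by ±i, which reduces to swapping real/imaginary parts and negating one component (the swap<2>/negodd/negeven sequence). A scalar reference version for comparison (illustration only; the diff's version processes `width` butterflies at once and has a split-format path):

    #include <complex>

    template <bool inverse, typename T>
    void radix4_butterfly_scalar(std::complex<T> a0, std::complex<T> a1,
                                 std::complex<T> a2, std::complex<T> a3,
                                 std::complex<T>& w0, std::complex<T>& w1,
                                 std::complex<T>& w2, std::complex<T>& w3)
    {
        const std::complex<T> sum02  = a0 + a2;
        const std::complex<T> diff02 = a0 - a2;
        const std::complex<T> sum13  = a1 + a3;
        const std::complex<T> d13    = a1 - a3;
        // Multiply (a1 - a3) by -i (forward) or +i (inverse): swap re/im and
        // negate one component; no full complex multiply is needed.
        const std::complex<T> diff13 = inverse
            ? std::complex<T>(-d13.imag(), d13.real())
            : std::complex<T>(d13.imag(), -d13.real());
        w0 = sum02 + sum13;
        w2 = sum02 - sum13;
        w1 = diff02 + diff13;
        w3 = diff02 - diff13;
    }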
+template <size_t width_, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass_first(size_t N, csize_t<width_>, cbool_t<splitout>,
+                                              cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                              complex<T>* out, const complex<T>* in,
+                                              const complex<T>*& twiddle)
+{
+    static_assert(width_ > 0, "width cannot be zero");
+    const size_t N4        = N / 4;
+    const size_t Nstride   = N4;
+    constexpr size_t width = width_;
+    constexpr bool split   = splitin || splitout;
+    static_assert(!split);
+    constexpr static size_t prefetch_cycles = 8;
+
+    // CMT_LOOP_NOUNROLL
+    for (size_t b = 0; b < N4; b += width)
     {
         if constexpr (prefetch)
-            prefetch_four(csize_t<64>(), out + prefetch_offset);
-        cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
-        split(cread_split<8, aligned, splitin>(out + 0), w0, w1);
-        split(cread_split<8, aligned, splitin>(out + 8), w2, w3);
-        split(cread_split<8, aligned, splitin>(out + 16), w4, w5);
-        split(cread_split<8, aligned, splitin>(out + 24), w6, w7);
-
-        butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
-
-        w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
-        w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
-        w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
-        w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
-        w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
-        w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
-        w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
-
-        cvec<T, 8> z0, z1, z2, z3;
-        transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
-
-        butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
-        cwrite<32, aligned>(out, bitreverse<2>(concat(concat(z0, z1), concat(z2, z3))));
-        out += 32;
+            prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+        cvec<T, width> tw1 = cread<width>(twiddle);
+        cvec<T, width> tw2 = cread<width>(twiddle + width);
+        cvec<T, width> tw3 = cread<width>(twiddle + 2 * width);
+
+        const cvec<T, width> a0 = cread<width>(in + 0 * Nstride);
+        const cvec<T, width> a1 = cread<width>(in + 1 * Nstride);
+        const cvec<T, width> a2 = cread<width>(in + 2 * Nstride);
+        const cvec<T, width> a3 = cread<width>(in + 3 * Nstride);
+        cvec<T, width> w0, w1, w2, w3;
+        radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
+        cvec<T, 4 * width> w0123 = concat(w0, w1, w2, w3);
+        w0123                    = ctranspose<width>(w0123);
+        cwrite<4 * width>(out, w0123);
+        twiddle += 3 * width;
+        in += width;
+        out += 4 * width;
+    }
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
+                                             cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                             complex<T>* out, const complex<T>* in,
+                                             const complex<T>*& twiddle)
+{
+    static_assert(width > 0, "width cannot be zero");
+    constexpr static size_t prefetch_cycles = 8;
+    constexpr bool split       = splitin || splitout;
+    constexpr bool read_split  = !splitin && split;
+    constexpr bool write_split = !splitout && split;
+    static_assert(!splitout);
+
+    CMT_PRAGMA_CLANG(clang loop unroll_count(4))
+    for (size_t n = 0; n < stride; n += width)
+    {
+        if constexpr (prefetch)
+            prefetch_four<width>(stride, in + prefetch_cycles * width);
+        const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
+        const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
+        const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
+        const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
+        cvec<T, width> w0, w1, w2, w3;
+        radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
+        cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+        cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+        cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+        cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+        in += width;
+        out += width;
     }
-    return {};
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix8_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
+                                             cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                             complex<T>* out, const complex<T>* in,
+                                             const complex<T>*& twiddle)
+{
+    static_assert(width > 0, "width cannot be zero");
+    constexpr static size_t prefetch_cycles = 4;
+    constexpr bool split       = splitin || splitout;
+    constexpr bool read_split  = !splitin && split;
+    constexpr bool write_split = !splitout && split;
+    static_assert(!splitout);
+
+    CMT_PRAGMA_CLANG(clang loop unroll_count(4))
+    for (size_t n = 0; n < stride; n += width)
+    {
+        if constexpr (prefetch)
+            prefetch_eight<width>(stride, in + prefetch_cycles * width);
+        const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
+        const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
+        const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
+        const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
+        const cvec<T, width> a4 = cread_split<width, false, read_split>(in + 4 * stride);
+        const cvec<T, width> a5 = cread_split<width, false, read_split>(in + 5 * stride);
+        const cvec<T, width> a6 = cread_split<width, false, read_split>(in + 6 * stride);
+        const cvec<T, width> a7 = cread_split<width, false, read_split>(in + 7 * stride);
+        cvec<T, width> w0, w1, w2, w3, w4, w5, w6, w7;
+        butterfly8<width, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, w0, w1, w2, w3, w4, w5, w6, w7);
+        cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+        cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+        cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+        cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+        cwrite_split<width, false, write_split>(out + 4 * stride, w4);
+        cwrite_split<width, false, write_split>(out + 5 * stride, w5);
+        cwrite_split<width, false, write_split>(out + 6 * stride, w6);
+        cwrite_split<width, false, write_split>(out + 7 * stride, w7);
+        in += width;
+        out += width;
+    }
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
+                                        cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+                                        complex<T>* out, const complex<T>* in, const complex<T>*& twiddle)
+{
+    static_assert(width > 0, "width cannot be zero");
+    constexpr static size_t prefetch_cycles = 8;
+    const size_t N4      = N / 4;
+    const size_t Nstride = stride * N4;
+    const size_t stride3 = 3 * stride;
+    constexpr bool split       = splitin || splitout;
+    constexpr bool read_split  = !splitin && split;
+    constexpr bool write_split = !splitout && split;
+
+    {
+        for (size_t n = 0; n < stride; n += width)
+        {
+            if constexpr (prefetch)
+                prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+            const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
+            const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
+            const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
+            const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
+            cvec<T, width> w0, w1, w2, w3;
+            radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
+            cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+            cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+            cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+            cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+            in += width;
+            out += width;
+        }
+        twiddle += 3;
+        out += stride3;
+    }
+
+    // CMT_LOOP_NOUNROLL
+    for (size_t b = 1; b < N4; b++)
+    {
+        cvec<T, width> tw1 = read_twiddle<width, split>(twiddle);
+        cvec<T, width> tw2 = read_twiddle<width, split>(twiddle + 1);
+        cvec<T, width> tw3 = read_twiddle<width, split>(twiddle + 2);
+        for (size_t n = 0; n < stride; n += width)
+        {
+            if constexpr (prefetch)
+                prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+            const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
+            const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
+            const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
+            const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
+            cvec<T, width> w0, w1, w2, w3;
+            radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
+            cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+            cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+            cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+            cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+            in += width;
+            out += width;
+        }
+        twiddle += 3;
+        out += stride3;
+    }
+}
+
+template <typename T>
+static void initialize_twiddle_autosort(size_t N, size_t w, complex<T>*& twiddle)
+{
+    for (size_t b = 0; b < N / 4; ++b)
+    {
+        cwrite<1>(twiddle + b / w * 3 * w + b % w + 0 * w, calculate_twiddle<T>(b, N));
+        cwrite<1>(twiddle + b / w * 3 * w + b % w + 1 * w, calculate_twiddle<T>(2 * b, N));
+        cwrite<1>(twiddle + b / w * 3 * w + b % w + 2 * w, calculate_twiddle<T>(3 * b, N));
+    }
+    twiddle += N / 4 * 3;
+}
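initialize_twiddle_autosort above stores, for each butterfly index b, the three factors w_N^b, w_N^{2b}, w_N^{3b} applied to outputs 1..3 of a radix-4 butterfly, grouped in blocks of w so a width-w pass can load them with contiguous vector reads. A scalar sketch of that layout (standalone; assumes the usual DFT convention w_N^k = exp(-2*pi*i*k/N), which is what calculate_twiddle is expected to produce):

    #include <cmath>
    #include <complex>
    #include <cstddef>
    #include <vector>

    template <typename T>
    std::vector<std::complex<T>> autosort_twiddles(std::size_t N, std::size_t w)
    {
        const T pi = std::acos(T(-1));
        auto tw = [N, pi](std::size_t k) {
            const T phi = T(-2) * pi * T(k % N) / T(N);
            return std::complex<T>(std::cos(phi), std::sin(phi));
        };
        std::vector<std::complex<T>> out(N / 4 * 3);
        for (std::size_t b = 0; b < N / 4; ++b)
        {
            // Factors for one radix-4 butterfly, grouped in blocks of w lanes.
            out[b / w * 3 * w + b % w + 0 * w] = tw(b);
            out[b / w * 3 * w + b % w + 1 * w] = tw(2 * b);
            out[b / w * 3 * w + b % w + 2 * w] = tw(3 * b);
        }
        return out;
    }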
+
+template <typename T>
+KFR_INTRINSIC void interleavehalves32(cvec<T, 32>& v0)
+{
+    cvec<T, 8> t0, t1, t2, t3;
+    split(v0, t0, t1, t2, t3);
+    t0 = interleavehalves(t0);
+    t1 = interleavehalves(t1);
+    t2 = interleavehalves(t2);
+    t3 = interleavehalves(t3);
+    v0 = concat(t0, t1, t2, t3);
 }
 
 template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
@@ -347,22 +696,47 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfa
                                   complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
 {
     CMT_ASSUME(blocks > 0);
-    DFT_ASSERT(2 <= blocks);
-    constexpr static size_t prefetch_offset = width * 16;
-    for (size_t b = 0; b < blocks; b += 2)
+    DFT_ASSERT(4 <= blocks);
+    constexpr static size_t prefetch_cycles = 8;
+    if constexpr (vector_capacity<T> >= 128)
     {
-        if constexpr (prefetch)
-            prefetch_one(out + prefetch_offset);
-
-        cvec<T, 8> vlo = cread<8, aligned>(out + 0);
-        cvec<T, 8> vhi = cread<8, aligned>(out + 8);
-        butterfly8<inverse>(vlo);
-        butterfly8<inverse>(vhi);
-        vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo);
-        vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi);
-        cwrite<8, aligned>(out, vlo);
-        cwrite<8, aligned>(out + 8, vhi);
-        out += 16;
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 4)
+        {
+            if constexpr (prefetch)
+                prefetch_one<32>(out + prefetch_cycles * 32);
+
+            cvec<T, 32> v32 = cread<32, aligned>(out);
+            cvec<T, 4> v0, v1, v2, v3, v4, v5, v6, v7;
+            v32 = ctranspose<8>(v32);
+            split(v32, v0, v1, v2, v3, v4, v5, v6, v7);
+            butterfly8<4, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
+            v32 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
+            v32 = ctranspose<4>(v32);
+            cwrite<32, aligned>(out, v32);
+
+            out += 32;
+        }
+    }
+    else
+    {
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 2)
+        {
+            if constexpr (prefetch)
+                prefetch_one<16>(out + prefetch_cycles * 16);
+
+            cvec<T, 16> v16 = cread<16, aligned>(out);
+            cvec<T, 2> v0, v1, v2, v3, v4, v5, v6, v7;
+            v16 = ctranspose<8>(v16);
+            split(v16, v0, v1, v2, v3, v4, v5, v6, v7);
+            butterfly8<2, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
+            v16 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
+            v16 = ctranspose<2>(v16);
+            cwrite<16, aligned>(out, v16);
+
+            out += 16;
+        }
     }
     return {};
 }
@@ -373,28 +747,67 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cf
                                   complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
 {
     CMT_ASSUME(blocks > 0);
-    constexpr static size_t prefetch_offset = width * 4;
-    DFT_ASSERT(2 <= blocks);
-    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
-    for (size_t b = 0; b < blocks; b += 2)
+    constexpr static size_t prefetch_cycles = 4;
+    DFT_ASSERT(4 <= blocks);
+    if constexpr (vector_capacity<T> >= 128)
     {
-        if constexpr (prefetch)
-            prefetch_one(out + prefetch_offset);
-
-        cvec<T, 16> vlo = cread<16, aligned>(out);
-        cvec<T, 16> vhi = cread<16, aligned>(out + 16);
-        butterfly4<4, inverse>(vlo);
-        butterfly4<4, inverse>(vhi);
-        apply_twiddles4<0, 4, 4, inverse>(vlo);
-        apply_twiddles4<0, 4, 4, inverse>(vhi);
-        vlo = digitreverse4<2>(vlo);
-        vhi = digitreverse4<2>(vhi);
-        butterfly4<4, inverse>(vlo);
-        butterfly4<4, inverse>(vhi);
-
-        use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
-        use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
-        out += 32;
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 4)
+        {
+            if constexpr (prefetch)
+                prefetch_one<64>(out + prefetch_cycles * 64);
+
+            cvec<T, 16> v0 = cread<16, aligned>(out);
+            cvec<T, 16> v1 = cread<16, aligned>(out + 16);
+            cvec<T, 16> v2 = cread<16, aligned>(out + 32);
+            cvec<T, 16> v3 = cread<16, aligned>(out + 48);
+            butterfly4_packed<4, inverse>(v0);
+            butterfly4_packed<4, inverse>(v1);
+            butterfly4_packed<4, inverse>(v2);
+            butterfly4_packed<4, inverse>(v3);
+            apply_twiddles4<0, 4, 4, inverse>(v0);
+            apply_twiddles4<0, 4, 4, inverse>(v1);
+            apply_twiddles4<0, 4, 4, inverse>(v2);
+            apply_twiddles4<0, 4, 4, inverse>(v3);
+            v0 = digitreverse4<2>(v0);
+            v1 = digitreverse4<2>(v1);
+            v2 = digitreverse4<2>(v2);
+            v3 = digitreverse4<2>(v3);
+            butterfly4_packed<4, inverse>(v0);
+            butterfly4_packed<4, inverse>(v1);
+            butterfly4_packed<4, inverse>(v2);
+            butterfly4_packed<4, inverse>(v3);
+
+            use_br2 ? cbitreverse_write(out, v0) : cdigitreverse4_write(out, v0);
+            use_br2 ? cbitreverse_write(out + 16, v1) : cdigitreverse4_write(out + 16, v1);
+            use_br2 ? cbitreverse_write(out + 32, v2) : cdigitreverse4_write(out + 32, v2);
+            use_br2 ? cbitreverse_write(out + 48, v3) : cdigitreverse4_write(out + 48, v3);
+            out += 64;
+        }
+    }
+    else
+    {
+        CMT_PRAGMA_CLANG(clang loop unroll(disable))
+        for (size_t b = 0; b < blocks; b += 2)
+        {
+            if constexpr (prefetch)
+                prefetch_one<32>(out + prefetch_cycles * 32);
+
+            cvec<T, 16> vlo = cread<16, aligned>(out);
+            cvec<T, 16> vhi = cread<16, aligned>(out + 16);
+            butterfly4_packed<4, inverse>(vlo);
+            butterfly4_packed<4, inverse>(vhi);
+            apply_twiddles4<0, 4, 4, inverse>(vlo);
+            apply_twiddles4<0, 4, 4, inverse>(vhi);
+            vlo = digitreverse4<2>(vlo);
+            vhi = digitreverse4<2>(vhi);
+            butterfly4_packed<4, inverse>(vlo);
+            butterfly4_packed<4, inverse>(vhi);
+
+            use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
+            use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
+            out += 32;
+        }
     }
     return {};
 }
@@ -404,16 +817,18 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfa
                                   cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
                                   complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
 {
-    constexpr static size_t prefetch_offset = width * 4;
+    constexpr static size_t prefetch_cycles = 8;
     CMT_ASSUME(blocks > 8);
     DFT_ASSERT(8 <= blocks);
     for (size_t b = 0; b < blocks; b += 4)
     {
         if constexpr (prefetch)
-            prefetch_one(out + prefetch_offset);
+            prefetch_one<16>(out + prefetch_cycles * 16);
 
         cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out);
-        butterfly4<4, inverse>(v16);
+        butterfly4_packed<4, inverse>(v16);
+        if constexpr (use_br2)
+            v16 = permutegroups<(8), 0, 2, 1, 3>(v16);
         cdigitreverse4_write<aligned>(out, v16);
 
         out += 4 * 4;
@@ -447,6 +862,7 @@ struct fft_stage_impl : dft_stage<T>
     constexpr static bool prefetch = fft_config<T>::prefetch;
     constexpr static bool aligned  = false;
     constexpr static size_t width  = fft_config<T>::process_width;
+    constexpr static bool use_br2  = !is_even || always_br2;
 
     virtual void do_initialize(size_t size) override final
     {
@@ -464,7 +880,7 @@ struct fft_stage_impl : dft_stage<T>
         const size_t stg_size = this->stage_size;
         CMT_ASSUME(stg_size >= 2048);
         CMT_ASSUME(stg_size % 2048 == 0);
-        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(),
+        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
     }
 };
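The csize<8> and csize<16> passes above now branch at compile time on vector_capacity<T>: targets with more SIMD register space process four blocks per iteration instead of two. The dispatch pattern itself, as a hedged sketch (`simd_capacity` is an invented stand-in for KFR's vector_capacity<T>):

    #include <cstddef>

    template <typename T>
    inline constexpr std::size_t simd_capacity = 64; // e.g. 128 on wider targets

    template <typename T>
    void radix8_last_pass(T* out, std::size_t blocks)
    {
        if constexpr (simd_capacity<T> >= 128)
        {
            for (std::size_t b = 0; b < blocks; b += 4)
            { /* transpose + butterfly over 4 blocks (32 complex values) */ }
        }
        else
        {
            for (std::size_t b = 0; b < blocks; b += 2)
            { /* transpose + butterfly over 2 blocks (16 complex values) */ }
        }
    }

Because the branch is if constexpr, only the selected loop is instantiated, so the narrow path imposes no register pressure on wide targets and vice versa.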
@@ -485,12 +901,18 @@ struct fft_final_stage_impl : dft_stage<T>
     constexpr static size_t width = fft_config<T>::process_width;
     constexpr static bool is_even = cometa::is_even(ilog2(size));
-    constexpr static bool use_br2 = !is_even;
+    constexpr static bool use_br2 = !is_even || always_br2;
     constexpr static bool aligned = false;
     constexpr static bool prefetch = fft_config<T>::prefetch && splitin;
 
-    KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
-    KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
+    template <bool pass_splitin>
+    KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cbool_t<pass_splitin>, complex<T>*&)
+    {
+    }
+    template <bool pass_splitin>
+    KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cbool_t<pass_splitin>, complex<T>*&)
+    {
+    }
 
     static constexpr bool get_pass_splitout(size_t N) { return N / 4 > 8 && N / 4 / 4 >= width; }
 
@@ -518,33 +940,25 @@ struct fft_final_stage_impl : dft_stage<T>
         final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
     }
 
-    template <bool inverse, typename U = T, KFR_ENABLE_IF(std::is_same_v<U, float>)>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
-    {
-        radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
-                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
-    }
-
-    template <bool inverse, typename U = T, KFR_ENABLE_IF(std::is_same_v<U, float>)>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
+    template <bool inverse, bool pass_splitin, typename U = T, KFR_ENABLE_IF(vector_capacity<U> >= 128)>
+    KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+                                       const complex<T>*, const complex<T>*& twiddle)
     {
         radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
-    template <bool inverse>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
+    template <bool inverse, bool pass_splitin>
+    KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                        const complex<T>*, const complex<T>*& twiddle)
     {
         radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
     }
 
-    template <bool inverse>
-    KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
-                                       const complex<T>*& twiddle)
+    template <bool inverse, bool pass_splitin>
+    KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+                                       const complex<T>*, const complex<T>*& twiddle)
     {
         radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
@@ -583,7 +997,77 @@ struct fft_reorder_stage_impl : dft_stage<T>
     template <bool inverse>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
     {
-        fft_reorder(out, this->user, cbool_t<!is_even>());
+        fft_reorder(out, this->user, cbool_t<(!is_even || always_br2)>());
     }
 };
 
+template <typename T, bool is_first, bool is_last, bool radix8>
+struct fft_autosort_stage_impl : dft_stage<T>
+{
+    fft_autosort_stage_impl(size_t stage_size, size_t stride)
+    {
+        this->name         = dft_name(this);
+        this->radix        = radix8 ? 8 : 4;
+        this->stage_size   = stage_size * stride * this->radix;
+        this->blocks       = stage_size;
+        this->recursion    = false;
+        this->can_inplace  = is_last;
+        this->need_reorder = false;
+        this->user         = stride;
+        if constexpr (!is_last)
+        {
+            this->data_size =
+                align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
+        }
+    }
+
+    constexpr static bool prefetch = fft_config<T>::prefetch;
+    constexpr static bool aligned  = false;
+
+    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+    void do_initialize(size_t total_size) final
+    {
+        if constexpr (!is_last)
+        {
+            complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+            if constexpr (is_first)
+                initialize_twiddle_autosort(this->blocks, width, twiddle);
+            else
+                initialize_twiddle_autosort(this->blocks, 1, twiddle);
+        }
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+    {
+        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+        const size_t stg_size     = this->blocks;
+        const size_t stride       = this->user;
+        if constexpr (is_first)
+        {
+            radix4_autosort_pass_first(stg_size, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+                                       cbool_t<inverse>(), out, in, twiddle);
+        }
+        else if constexpr (is_last)
+        {
+            if constexpr (radix8)
+                radix8_autosort_pass_last(stride, csize_t<width / 2>(), cfalse, cfalse, cbool_t<prefetch>(),
+                                          cbool_t<inverse>(), out, in, twiddle);
+            else
+                radix4_autosort_pass_last(stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+                                          cbool_t<inverse>(), out, in, twiddle);
+        }
+        else
+        {
+            if (stride == 4)
+                radix4_autosort_pass(stg_size, stride, csize_t<4>(), cfalse, cfalse, cbool_t<prefetch>(),
                                      cbool_t<inverse>(), out, in, twiddle);
+            else
+                radix4_autosort_pass(stg_size, stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                      cbool_t<inverse>(), out, in, twiddle);
+        }
+    }
+};
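fft_autosort_stage_impl above is the building block of the new "autosort" (Stockham-style) path: instead of running all butterflies in place and then fixing the order with a separate fft_reorder pass, each pass writes its results already partially reordered into the other buffer, so data ping-pongs between output and scratch and arrives in natural order. A compact, self-contained radix-2 reference version of the principle (a sketch, not KFR's implementation, which uses the radix-4/8 SIMD passes above):

    #include <cmath>
    #include <complex>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // Radix-2 Stockham autosort FFT; data.size() must be a power of two.
    void stockham_fft(std::vector<std::complex<double>>& data)
    {
        const std::size_t n = data.size();
        const double pi     = std::acos(-1.0);
        std::vector<std::complex<double>> scratch(n);
        std::complex<double>* x = data.data();
        std::complex<double>* y = scratch.data();
        for (std::size_t s = 1, m = n / 2; m >= 1; m /= 2, s *= 2)
        {
            for (std::size_t p = 0; p < m; ++p)
            {
                // Outputs land at indices 2p and 2p+1: this self-sorting
                // placement is what removes the bit-reversal pass.
                const std::complex<double> w = std::polar(1.0, -pi * double(p) / double(m));
                for (std::size_t q = 0; q < s; ++q)
                {
                    const std::complex<double> a = x[q + s * p];
                    const std::complex<double> c = x[q + s * (p + m)];
                    y[q + s * (2 * p + 0)] = a + c;
                    y[q + s * (2 * p + 1)] = (a - c) * w;
                }
            }
            std::swap(x, y); // ping-pong between the two buffers
        }
        if (x != data.data())
            data.assign(x, x + n); // odd pass count: result ended up in scratch
    }

The cost of the approach is the extra buffer traffic, which is why the fixed-size specializations below carefully schedule which pass reads and writes which buffer so the final pass always lands in `out`.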
@@ -593,7 +1077,11 @@ struct fft_specialization;
 template <typename T>
 struct fft_specialization<T, 0> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 1;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -608,7 +1096,11 @@ struct fft_specialization<T, 0> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 1> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 2;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -625,7 +1117,11 @@ struct fft_specialization<T, 1> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 2> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 4;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -642,7 +1138,11 @@ struct fft_specialization<T, 2> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 3> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 8;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -650,7 +1150,7 @@ struct fft_specialization<T, 3> : dft_stage<T>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         cvec<T, 8> v8 = cread<8, aligned>(in);
-        butterfly8<inverse>(v8);
+        butterfly8_packed<inverse>(v8);
         cwrite<8, aligned>(out, v8);
     }
 };
@@ -658,7 +1158,11 @@ struct fft_specialization<T, 3> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 4> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 16;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -666,7 +1170,7 @@ struct fft_specialization<T, 4> : dft_stage<T>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         cvec<T, 16> v16 = cread<16, aligned>(in);
-        butterfly16<inverse>(v16);
+        butterfly16_packed<inverse>(v16);
         cwrite<16, aligned>(out, v16);
     }
 };
@@ -674,7 +1178,11 @@ struct fft_specialization<T, 4> : dft_stage<T>
 template <typename T>
 struct fft_specialization<T, 5> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 32;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
@@ -682,25 +1190,100 @@ struct fft_specialization<T, 5> : dft_stage<T>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
         cvec<T, 32> v32 = cread<32, aligned>(in);
-        butterfly32<inverse>(v32);
+        butterfly32_packed<inverse>(v32);
         cwrite<32, aligned>(out, v32);
     }
 };
 
+#ifdef KFR_AUTOSORT_FOR_64
+template <typename T>
+struct fft_specialization<T, 6> : dft_stage<T>
+{
+    fft_specialization(size_t stage_size)
+    {
+        this->stage_size = 64;
+        this->name       = dft_name(this);
+        this->temp_size  = 64 * sizeof(complex<T>);
+        this->data_size  = 64 * sizeof(complex<T>);
+    }
+
+    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+    void do_initialize(size_t) final
+    {
+        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+        initialize_twiddle_autosort(64, width, twiddle);
+        initialize_twiddle_autosort(16, 1, twiddle);
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+    {
+        auto no              = cfalse;
+        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+        complex<T>* scratch  = ptr_cast<complex<T>>(temp);
+        radix4_autosort_pass_first(64, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+        radix4_autosort_pass(16, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+        radix4_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+    }
+};
+#else
 template <typename T>
 struct fft_specialization<T, 6> : dft_stage<T>
 {
-    fft_specialization(size_t) { this->name = dft_name(this); }
+    fft_specialization(size_t)
+    {
+        this->stage_size = 64;
+        this->name       = dft_name(this);
+    }
 
     constexpr static bool aligned = false;
     DFT_STAGE_FN
     template <bool inverse>
     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
     {
-        butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
+        butterfly64_memory(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
     }
 };
+#endif
 
+#ifdef KFR_AUTOSORT_FOR_128D
+template <>
+struct fft_specialization<double, 7> : dft_stage<double>
+{
+    using T = double;
+    fft_specialization(size_t stage_size)
+    {
+        this->stage_size = 128;
+        this->name       = dft_name(this);
+        this->temp_size  = 128 * sizeof(complex<T>);
+        this->data_size  = 128 * sizeof(complex<T>);
+    }
+
+    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+    void do_initialize(size_t) final
+    {
+        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+        initialize_twiddle_autosort(128, width, twiddle);
+        initialize_twiddle_autosort(32, 1, twiddle);
+        initialize_twiddle_autosort(8, 1, twiddle);
+    }
+
+    DFT_STAGE_FN
+    template <bool inverse>
+    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+    {
+        auto no              = cfalse;
+        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+        complex<T>* scratch  = ptr_cast<complex<T>>(temp);
+        radix4_autosort_pass_first(128, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+        radix4_autosort_pass(32, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+        radix8_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+    }
+};
+#else
 template <>
 struct fft_specialization<double, 7> : dft_stage<double>
 {
@@ -741,6 +1324,7 @@ struct fft_specialization<double, 7> : dft_stage<double>
         fft_reorder(out, csize_t<7>());
     }
 };
+#endif
 
 template <>
 struct fft_specialization<float, 7> : dft_stage<float>
@@ -775,7 +1359,9 @@ struct fft_specialization<float, 7> : dft_stage<float>
         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
         radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
                     cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
-        radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+        radix4_pass(32, 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+                    cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+        radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
         if (this->need_reorder)
             fft_reorder(out, csize_t<7>());
@@ -787,8 +1373,9 @@ struct fft_specialization<float, 8> : dft_stage<float>
 {
     fft_specialization(size_t)
     {
-        this->name      = dft_name(this);
-        this->temp_size = sizeof(complex<float>) * 256;
+        this->stage_size = 256;
+        this->name       = dft_name(this);
+        this->temp_size  = sizeof(complex<float>) * 256;
     }
 
     using T = float;
@@ -824,13 +1411,62 @@ struct fft_specialization<float, 8> : dft_stage<float>
     }
 };
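A note on the storage budget of the fixed-size autosort plans (the 128-point double plan above, and the 256/512/1024 plans that follow): each specialization reserves N * sizeof(complex<T>) for twiddles, which suffices because a pass over sub-size M consumes M/4*3 twiddles and the sub-sizes shrink geometrically, keeping the total under N. A quick hedged check of that accounting, mirroring the initialize_twiddle_autosort calls:

    #include <cstddef>
    #include <initializer_list>

    constexpr std::size_t twiddle_count(std::initializer_list<std::size_t> pass_sizes)
    {
        std::size_t total = 0;
        for (std::size_t m : pass_sizes) // one initialize_twiddle_autosort per pass
            total += m / 4 * 3;
        return total;
    }

    static_assert(twiddle_count({ 128, 32, 8 }) == 126, "fits in 128");
    static_assert(twiddle_count({ 256, 64, 16 }) == 252, "fits in 256");
    static_assert(twiddle_count({ 512, 128, 32, 8 }) == 510, "fits in 512");
    static_assert(twiddle_count({ 1024, 256, 64, 16 }) == 1020, "fits in 1024");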
scratch, tw); + radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw); + radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw); + } + } +}; +#else template <> struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> { using T = double; fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size) { - this->name = dft_name(this); + this->stage_size = 256; + this->name = dft_name(this); } DFT_STAGE_FN @@ -839,16 +1475,56 @@ struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256> { fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr); if (this->need_reorder) - fft_reorder(out, csize_t<8>()); + fft_reorder(out, csize_t<8>(), cbool<always_br2>); } }; +#endif + +#ifdef KFR_AUTOSORT_FOR_512 + +template <typename T> +struct fft_specialization<T, 9> : dft_stage<T> +{ + fft_specialization(size_t stage_size) + { + this->stage_size = 512; + this->name = dft_name(this); + this->temp_size = 512 * sizeof(complex<T>); + this->data_size = 512 * sizeof(complex<T>); + } + constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width)); + + void do_initialize(size_t) final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddle_autosort(512, width, twiddle); + initialize_twiddle_autosort(128, 1, twiddle); + initialize_twiddle_autosort(32, 1, twiddle); + initialize_twiddle_autosort(8, 1, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + auto no = cfalse; + const complex<T>* tw = ptr_cast<complex<T>>(this->data); + complex<T>* scratch = ptr_cast<complex<T>>(temp); + radix4_autosort_pass_first(512, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw); + radix4_autosort_pass(128, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass(32, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw); + radix8_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw); + } +}; +#else template <typename T> struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> { fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size) { - this->name = dft_name(this); + this->stage_size = 512; + this->name = dft_name(this); } DFT_STAGE_FN @@ -860,7 +1536,47 @@ struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512> fft_reorder(out, csize_t<9>()); } }; +#endif +#ifdef KFR_AUTOSORT_FOR_1024 +template <typename T> +struct fft_specialization<T, 10> : dft_stage<T> +{ + fft_specialization(size_t stage_size) + { + this->stage_size = 1024; + this->name = dft_name(this); + this->temp_size = 1024 * sizeof(complex<T>); + this->data_size = 1024 * sizeof(complex<T>); + } + + constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width)); + + void do_initialize(size_t) final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddle_autosort(1024, width, twiddle); + initialize_twiddle_autosort(256, 1, twiddle); + initialize_twiddle_autosort(64, 1, twiddle); + initialize_twiddle_autosort(16, 1, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + auto no = cfalse; + auto split = cfalse; + const complex<T>* tw = ptr_cast<complex<T>>(this->data); + complex<T>* 
scratch = ptr_cast<complex<T>>(temp); + radix4_autosort_pass_first(1024, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw); + radix4_autosort_pass(256, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass(64, 16, csize<width>, split, no, no, cbool<inverse>, scratch, out, tw); + radix4_autosort_pass(16, 64, csize<width>, split, split, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass_last(256, csize<width>, no, split, no, cbool<inverse>, out, out, tw); + } +}; +#else template <typename T> struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> { @@ -878,23 +1594,86 @@ struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> fft_reorder(out, 10, cfalse); } }; +#endif + +#ifdef KFR_AUTOSORT_FOR_2048 +template <typename T> +struct fft_specialization<T, 11> : dft_stage<T> +{ + fft_specialization(size_t stage_size) + { + this->stage_size = 2048; + this->name = dft_name(this); + this->temp_size = 2048 * sizeof(complex<T>); + this->data_size = 2048 * sizeof(complex<T>); + } + + constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width)); + + void do_initialize(size_t) final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + initialize_twiddle_autosort(2048, width, twiddle); + initialize_twiddle_autosort(512, 1, twiddle); + initialize_twiddle_autosort(128, 1, twiddle); + initialize_twiddle_autosort(32, 1, twiddle); + initialize_twiddle_autosort(8, 1, twiddle); + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + auto no = cfalse; + const complex<T>* tw = ptr_cast<complex<T>>(this->data); + complex<T>* scratch = ptr_cast<complex<T>>(temp); + radix4_autosort_pass_first(2048, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw); + radix4_autosort_pass(512, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw); + radix4_autosort_pass(128, 16, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw); + radix4_autosort_pass(32, 64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw); + radix8_autosort_pass_last(256, csize<width>, no, no, no, cbool<inverse>, out, out, tw); + } +}; +#endif } // namespace intrinsics template <bool is_even, bool first, typename T> void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>, cbool_t<first>) { - constexpr size_t final_size = is_even ? 1024 : 512; - - if (stage_size >= 2048) + if constexpr (use_autosort) { - add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size); - - make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); + if (stage_size >= 16) + { + add_stage<intrinsics::fft_autosort_stage_impl<T, first, false, false>>(self, stage_size, + self->size / stage_size); + make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); + } + else + { + if (stage_size == 8) + add_stage<intrinsics::fft_autosort_stage_impl<T, false, true, true>>(self, stage_size, + self->size / stage_size); + else + add_stage<intrinsics::fft_autosort_stage_impl<T, false, true, false>>( + self, stage_size, self->size / stage_size); + } } else { - add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size); + constexpr size_t final_size = is_even ? 
1024 : 512; + + if (stage_size >= 2048) + { + add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size); + + make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); + } + else + { + add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size); + add_stage<intrinsics::fft_reorder_stage_impl<T, is_even>>(self, self->size); + } } } @@ -935,7 +1714,7 @@ KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self) { self->data = autofree<u8>(self->data_size); size_t offset = 0; - for (dft_stage_ptr<T>& stage : self->stages) + for (dft_stage_ptr<T>& stage : self->all_stages) { initialize_data_stage(self, stage, offset); } @@ -945,21 +1724,10 @@ KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self) template <typename T> KFR_INTRINSIC void initialize_order(dft_plan<T>* self) { - bool to_scratch = false; - bool scratch_needed = false; - for (dft_stage_ptr<T>& stage : reversed(self->stages)) - { - if (to_scratch) - { - scratch_needed = true; - } - stage->to_scratch = to_scratch; - if (!stage->can_inplace) - { - to_scratch = !to_scratch; - } - } - if (scratch_needed || !self->stages[0]->can_inplace) + self->calc_disposition(); + typename dft_plan<T>::bitset ored = self->disposition_inplace[0] | self->disposition_inplace[1] | + self->disposition_outofplace[0] | self->disposition_outofplace[1]; + if (ored.any()) // if scratch needed self->temp_size += align_up(sizeof(complex<T>) * self->size, platform<>::native_cache_alignment); } @@ -975,15 +1743,8 @@ KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order) constexpr size_t log2nv = val_of(decltype(log2n)()); add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size); }, - [&]() - { - cswitch(cfalse_true, is_even(log2n), - [&](auto is_even) - { - make_fft(self, size, is_even, ctrue); - constexpr size_t is_evenv = val_of(decltype(is_even)()); - add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(self, size); - }); + [&]() { + cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { make_fft(self, size, is_even, ctrue); }); }); } @@ -1099,12 +1860,12 @@ void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const com const cvec<T, width> fpk = cread<width>(in + i); const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); - const cvec<T, width> f1k = fpk + fpnk; - const cvec<T, width> f2k = fpk - fpnk; - const cvec<T, width> t = cmul_conj(f2k, tw); - cwrite<width>(out + i, f1k + t); - cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); - }); + const cvec<T, width> f1k = fpk + fpnk; + const cvec<T, width> f2k = fpk - fpnk; + const cvec<T, width> t = cmul_conj(f2k, tw); + cwrite<width>(out + i, f1k + t); + cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); + }); if (is_even(csize)) { size_t k = csize / 2; @@ -1151,10 +1912,11 @@ struct dft_stage_real_repack : dft_stage<T> public: dft_stage_real_repack(size_t real_size, dft_pack_format fmt) { - this->user = static_cast<int>(fmt); - this->stage_size = real_size; + this->user = static_cast<int>(fmt); + this->stage_size = real_size; + this->can_inplace = true; const size_t count = (real_size / 2 + 1) / 2; - this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment); + this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment); } void do_initialize(size_t) override { @@ -1162,15 +1924,15 @@ public: constexpr size_t width = vector_width<T> * 2; size_t real_size = this->stage_size; complex<T>* 
rtwiddle = ptr_cast<complex<T>>(this->data); - const size_t count = (real_size / 2 + 1) / 2; + const size_t count = (real_size / 2 + 1) / 2; block_process(count, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); cwrite<width>( rtwiddle + i, - cossin(dup(-constants<T>::pi * - ((enumerate<T, width>() + i + real_size / T(4)) / (real_size / 2))))); + cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + real_size / T(4)) / + (real_size / 2))))); }); } void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override @@ -1191,10 +1953,10 @@ void dft_real_initialize(dft_plan_real<T>& plan) if (plan.size == 0) return; initialize_stages(&plan); - plan.fmt_stage.reset(new dft_stage_real_repack<T>(plan.size, plan.fmt)); - plan.data_size += plan.fmt_stage->data_size; - size_t offset = initialize_data(&plan); - initialize_data_stage(&plan, plan.fmt_stage, offset); + add_stage<dft_stage_real_repack<T>, false>(&plan, plan.size, plan.fmt); + plan.stages[0].push_back(plan.all_stages.back().get()); + plan.stages[1].insert(plan.stages[1].begin(), plan.all_stages.back().get()); + initialize_data(&plan); initialize_order(&plan); } diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp @@ -37,7 +37,6 @@ #include "../../base/memory.hpp" #include "../data/sincos.hpp" - CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wpass-failed") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") @@ -181,7 +180,7 @@ template <size_t N, bool A = false, bool split = false, typename T> KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) { cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); - if (split) + if constexpr (split) temp = splitpairs(temp); return temp; } @@ -190,7 +189,7 @@ template <size_t N, bool A = false, bool split = false, typename T> KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) { cvec<T, N> v = value; - if (split) + if constexpr (split) v = interleavehalves(v); v.write(ptr_cast<T>(dest), cbool_t<A>()); } @@ -278,12 +277,14 @@ KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices template <size_t N, size_t stride, typename T> KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) { - if (stride == 1) + if constexpr (stride == 1) { return ref_cast<cvec<T, N>>(*base); } else + { return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); + } } KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) @@ -342,7 +343,7 @@ KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, cs template <size_t N, size_t stride, typename T> KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) { - if (stride == 1) + if constexpr (stride == 1) { cwrite<N>(base, value); } @@ -510,35 +511,35 @@ template <size_t k, size_t size, bool inverse = false, typename T, size_t width, KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) { constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); - if (kk == 0) + if constexpr (kk == 0) { return x; } - else if (kk == size * 1 / 8) + else if constexpr (kk == size * 1 / 8) { return swap<2>(subadd(swap<2>(x), x)) * isqrt2; } - else if (kk == size * 2 / 8) + else if constexpr (kk == size * 2 / 8) { return negodd(swap<2>(x)); } - else if (kk == size * 3 / 8) + else if constexpr (kk == size * 3 / 8) { return subadd(x, swap<2>(x)) * -isqrt2; } - else if (kk == size * 4 / 8) + else if constexpr (kk == size * 4 / 8) { return -x; } - else if 
(kk == size * 5 / 8) + else if constexpr (kk == size * 5 / 8) { return swap<2>(subadd(swap<2>(x), x)) * -isqrt2; } - else if (kk == size * 6 / 8) + else if constexpr (kk == size * 6 / 8) { return swap<2>(negodd(x)); } - else if (kk == size * 7 / 8) + else if constexpr (kk == size * 7 / 8) { return subadd(x, swap<2>(x)) * isqrt2; } @@ -582,7 +583,7 @@ KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, c diff13 = high(diff0213); w0 = sum02 + sum13; w2 = sum02 - sum13; - if (inverse) + if constexpr (inverse) { diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); diff13 = swap<2>(diff13); @@ -679,7 +680,7 @@ KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, } template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8) +KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8) { cvec<T, 2> w0, w1, w2, w3; split<T, 16>(v8, w0, w1, w2, w3); @@ -688,7 +689,7 @@ KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8) } template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32) +KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32) { cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; split(v32, w0, w1, w2, w3, w4, w5, w6, w7); @@ -710,7 +711,7 @@ KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32) } template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123) +KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123) { cvec<T, N> a0; cvec<T, N> a1; @@ -722,7 +723,7 @@ KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123) } template <size_t N, typename T> -KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01) +KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01) { cvec<T, N> a0; cvec<T, N> a1; @@ -734,14 +735,14 @@ KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01) template <size_t N, bool inverse = false, bool split_format = false, typename T> KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) { - if (split_format) + if constexpr (split_format) { vec<T, N> re1, im1, tw1re, tw1im; split<T, 2 * N>(a1, re1, im1); split<T, 2 * N>(tw1, tw1re, tw1im); vec<T, N> b1re = re1 * tw1re; vec<T, N> b1im = im1 * tw1re; - if (inverse) + if constexpr (inverse) w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im); else w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im); @@ -752,7 +753,7 @@ KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cv const cvec<T, N> a1_ = swap<2>(a1); cvec<T, N> tw1_ = tw1; - if (inverse) + if constexpr (inverse) tw1_ = -(tw1_); w1 = subadd(b1, a1_ * dupodd(tw1_)); } @@ -839,37 +840,38 @@ KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) } template <bool inverse, bool aligned, typename T> -KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in) +KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, + const complex<T>* in) { cvec<T, 16> w0, w1, w2, w3; w0 = cread_group<4, 4, 16, aligned>( in); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48)); - butterfly4<4, inverse>(w0); + butterfly4_packed<4, inverse>(w0); apply_twiddles4<0, 1, 4, inverse>(w0); w1 = cread_group<4, 4, 16, aligned>( in + 4); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52)); - butterfly4<4, inverse>(w1); + butterfly4_packed<4, inverse>(w1); apply_twiddles4<4, 1, 4, inverse>(w1); w2 = cread_group<4, 4, 16, aligned>( in + 8); // 
concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56)); - butterfly4<4, inverse>(w2); + butterfly4_packed<4, inverse>(w2); apply_twiddles4<8, 1, 4, inverse>(w2); w3 = cread_group<4, 4, 16, aligned>( in + 12); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60)); - butterfly4<4, inverse>(w3); + butterfly4_packed<4, inverse>(w3); apply_twiddles4<12, 1, 4, inverse>(w3); transpose4(w0, w1, w2, w3); // pass 2: - butterfly4<4, inverse>(w0); - butterfly4<4, inverse>(w1); - butterfly4<4, inverse>(w2); - butterfly4<4, inverse>(w3); + butterfly4_packed<4, inverse>(w0); + butterfly4_packed<4, inverse>(w1); + butterfly4_packed<4, inverse>(w2); + butterfly4_packed<4, inverse>(w3); transpose4(w0, w1, w2, w3); @@ -881,26 +883,26 @@ KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* o apply_vertical_twiddles4<4, inverse>(w1, w2, w3); // pass 3: - butterfly4<4, inverse>(w3); + butterfly4_packed<4, inverse>(w3); cwrite_group<4, 4, 16, aligned>(out + 12, w3); // split(w3, out[3], out[7], out[11], out[15]); - butterfly4<4, inverse>(w2); + butterfly4_packed<4, inverse>(w2); cwrite_group<4, 4, 16, aligned>(out + 8, w2); // split(w2, out[2], out[6], out[10], out[14]); - butterfly4<4, inverse>(w1); + butterfly4_packed<4, inverse>(w1); cwrite_group<4, 4, 16, aligned>(out + 4, w1); // split(w1, out[1], out[5], out[9], out[13]); - butterfly4<4, inverse>(w0); + butterfly4_packed<4, inverse>(w0); cwrite_group<4, 4, 16, aligned>(out, w0); // split(w0, out[0], out[4], out[8], out[12]); } template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly16(cvec<T, 16>& v16) +KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16) { - butterfly4<4, inverse>(v16); + butterfly4_packed<4, inverse>(v16); apply_twiddles4<0, 4, 4, inverse>(v16); v16 = digitreverse4<2>(v16); - butterfly4<4, inverse>(v16); + butterfly4_packed<4, inverse>(v16); } template <size_t index, bool inverse = false, typename T> @@ -1505,7 +1507,7 @@ template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr)); - if (transposed) + if constexpr (transposed) temp = ctranspose<sizeof...(N)>(temp); split(temp, w...); } @@ -1533,7 +1535,7 @@ template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... 
args) { auto temp = concat(args...); - if (transposed) + if constexpr (transposed) temp = ctransposeinverse<sizeof...(N)>(temp); write(ptr_cast<T>(ptr), temp); } @@ -1634,7 +1636,7 @@ KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t const cvec<T, 1> ina = cread<1>(in + (1 + j)); const cvec<T, 1> inb = cread<1>(in + radix - (j + 1)); cvec<T, width> tw = cread<width>(twiddle); - if (inverse) + if constexpr (inverse) tw = negodd /*cconj*/ (tw); cmul_2conj(sum0, sum1, ina, inb, tw); @@ -1642,9 +1644,6 @@ KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t } twiddle = twiddle - halfradix_sqr + width; - // if (inverse) - // std::swap(sum0, sum1); - if (is_constant_val(ostride)) { cwrite<width>(out + (1 + i), sum0); diff --git a/include/kfr/simd/digitreverse.hpp b/include/kfr/simd/digitreverse.hpp @@ -39,29 +39,37 @@ CMT_PRAGMA_GNU(GCC diagnostic push) CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative") -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } +constexpr KFR_INTRINSIC u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; } template <u32 m, u32 shift, u32... values> -constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) +constexpr KFR_INTRINSIC u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>) { return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>()); } template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>) +constexpr KFR_INTRINSIC u32 digitreverse_impl(u32 x, csize_t<2>) { - return bit_permute_step_impl( - x, - cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); + constexpr cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16> + steps{}; + if constexpr (bits > 16) + return bit_permute_step_impl(x, steps) >> (32 - bits); + else if constexpr (bits > 8) + return bit_permute_step_impl(x, steps[csizeseq<8>]) >> (16 - bits); + else + return bit_permute_step_impl(x, steps[csizeseq<6>]) >> (8 - bits); } template <size_t bits> -constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>) +constexpr KFR_INTRINSIC u32 digitreverse_impl(u32 x, csize_t<4>) { - return bit_permute_step_impl( - x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >> - (32 - bits); + constexpr cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16> steps{}; + if constexpr (bits > 16) + return bit_permute_step_impl(x, steps) >> (32 - bits); + else if constexpr (bits > 8) + return bit_permute_step_impl(x, steps[csizeseq<6>]) >> (16 - bits); + else + return bit_permute_step_impl(x, steps[csizeseq<4>]) >> (8 - bits); } CMT_PRAGMA_GNU(GCC diagnostic pop) @@ -69,7 +77,7 @@ CMT_PRAGMA_GNU(GCC diagnostic pop) template <size_t radix, size_t bits> struct shuffle_index_digitreverse { - constexpr inline size_t operator()(size_t index) const CMT_NOEXCEPT + constexpr KFR_INTRINSIC size_t operator()(size_t index) const CMT_NOEXCEPT { return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>()); } @@ -96,13 +104,13 @@ KFR_INTRINSIC vec<T, N> digitreverse4(const vec<T, N>& x) } template <size_t bits> -constexpr inline u32 bitreverse(u32 x) +constexpr KFR_INTRINSIC u32 bitreverse(u32 x) { return internal::digitreverse_impl<bits>(x, csize_t<2>()); } template <size_t bits> -constexpr inline u32 digitreverse4(u32 
x) +constexpr KFR_INTRINSIC u32 digitreverse4(u32 x) { return internal::digitreverse_impl<bits>(x, csize_t<4>()); } diff --git a/include/kfr/simd/impl/specializations.hpp b/include/kfr/simd/impl/specializations.hpp @@ -22,22 +22,20 @@ #include "../shuffle.hpp" #endif -#ifdef KFR_COMPILER_GNU +#ifdef CMT_COMPILER_GNU namespace kfr { inline namespace CMT_ARCH_NAME { -namespace intrinsics -{ template <> -inline vec<f32, 32> shufflevector<f32, 32>( +template <> +inline vec<f32, 32> vec<f32, 32>::shuffle( csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, - 15, 22, 23, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) + 15, 22, 23, 30, 31>) const CMT_NOEXCEPT { - f32x32 w = x; + f32x32 w = *this; w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)), permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w))); @@ -47,12 +45,12 @@ inline vec<f32, 32> shufflevector<f32, 32>( } template <> -inline vec<f32, 32> shufflevector<f32, 32>( +template <> +inline vec<f32, 32> vec<f32, 32>::shuffle( csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22, - 23, 14, 15, 30, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) + 23, 14, 15, 30, 31>) const CMT_NOEXCEPT { - f32x32 w = x; + f32x32 w = *this; w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)), permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w))); @@ -63,54 +61,55 @@ inline vec<f32, 32> shufflevector<f32, 32>( inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x) { - return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, - 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>, - x, x); + return x.shuffle(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, + 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>); } template <> -inline vec<f32, 64> shufflevector<f32, 64>( +template <> +inline vec<f32, 64> vec<f32, 64>::shuffle( csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13, 44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38, - 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>, - const vec<f32, 64>& x, const vec<f32, 64>&) + 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>) const CMT_NOEXCEPT { return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>( - concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x)))); + concat(bitreverse_2(even<8>(*this)), bitreverse_2(odd<8>(*this)))); } template <> -inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>, - const vec<f32, 16>& x, const vec<f32, 16>&) +template <> +inline vec<f32, 16> vec<f32, 16>::shuffle( + csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>) const CMT_NOEXCEPT { - // asm volatile("int $3"); - const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x); + const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(*this); - return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)), - shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx))); + return concat(low(xx).shuffle(high(xx), csizes<0, 2, 8 + 0, 8 + 2, 4, 6, 8 + 4, 8 + 6>), + low(xx).shuffle(high(xx), csizes<1, 3, 8 + 1, 8 + 3, 5, 7, 8 + 5, 8 + 7>)); } template <> -inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>, - const vec<f32, 16>& x, const vec<f32, 
16>&) +template <> +inline vec<f32, 16> vec<f32, 16>::shuffle( + csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>) const CMT_NOEXCEPT { const vec<f32, 16> xx = - concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); + concat(low(*this).shuffle(high(*this), csizes<0, 8 + 0, 1, 8 + 1, 4, 8 + 4, 5, 8 + 5>), + low(*this).shuffle(high(*this), csizes<2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7>)); return permutegroups<(4), 0, 2, 1, 3>(xx); } template <> -inline vec<f32, 32> shufflevector<f32, 32>( +template <> +inline vec<f32, 32> vec<f32, 32>::shuffle( csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, - 29, 14, 30, 15, 31>, - const vec<f32, 32>& x, const vec<f32, 32>&) + 29, 14, 30, 15, 31>) const CMT_NOEXCEPT { - const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x); + const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(*this); return concat(interleavehalves(low(xx)), interleavehalves(high(xx))); } -} // namespace intrinsics + } // namespace CMT_ARCH_NAME } // namespace kfr #endif diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -18,6 +18,15 @@ using namespace kfr; namespace CMT_ARCH_NAME { +TEST(print_vector_capacity) +{ + println("vector_capacity<float> = ", vector_capacity<float>); + println("vector_capacity<double> = ", vector_capacity<double>); + + println("fft_config<float>::process_width = ", vector_capacity<float> / 16); + println("fft_config<double>::process_width = ", vector_capacity<double> / 16); +} + #ifdef CMT_NATIVE_F64 constexpr ctypes_t<float, double> dft_float_types{}; #else @@ -78,7 +87,7 @@ static void perf_test(int size) TEST(test_performance) { - for (int size = 16; size <= 16384; size <<= 1) + for (int size = 16; size <= 65536; size <<= 1) { perf_test(size); } @@ -160,7 +169,7 @@ constexpr size_t fft_stopsize = 12; constexpr size_t dft_stopsize = 101; #endif #else -constexpr size_t fft_stopsize = 20; +constexpr size_t fft_stopsize = 21; #ifndef KFR_DFT_NO_NPo2 constexpr size_t dft_stopsize = 257; #endif @@ -183,15 +192,14 @@ TEST(fft_real) TEST(fft_real_not_size_4N) { kfr::univector<double, 6> in = counter(); - auto out = realdft(in); - kfr::univector<kfr::complex<double>> expected { - 15.0, { -3, 5.19615242}, {-3, +1.73205081}, -3.0 }; + auto out = realdft(in); + kfr::univector<kfr::complex<double>> expected{ 15.0, { -3, 5.19615242 }, { -3, +1.73205081 }, -3.0 }; CHECK(rms(cabs(out - expected)) <= 0.00001f); kfr::univector<double, 6> rev = irealdft(out) / 6; CHECK(rms(rev - in) <= 0.00001f); - random_state gen = random_init(2247448713, 915890490, 864203735, 2982561); - constexpr size_t size = 66; + random_state gen = random_init(2247448713, 915890490, 864203735, 2982561); + constexpr size_t size = 66; kfr::univector<double, size> in2 = gen_random_range<double>(gen, -1.0, +1.0); kfr::univector<kfr::complex<double>, size / 2 + 1> out2 = realdft(in2); kfr::univector<double, size> rev2 = irealdft(out2) / size;
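Note on the autosort kernels: the radix4_autosort_pass_first / radix4_autosort_pass / radix{4,8}_autosort_pass_last chains in the specializations above follow the Stockham autosort formulation. Each pass writes its butterflies back already in natural order, ping-ponging between the destination and the scratch buffer passed in temp, which is why these stages carry no fft_reorder step. A minimal radix-2 sketch of the idea in plain std::complex (an illustration under that interpretation, not the SIMD kernels themselves):

#include <cmath>
#include <complex>
#include <cstddef>
#include <utility>
#include <vector>

// Minimal radix-2 Stockham autosort FFT (illustration only; the kernels
// above are radix-4/8 and work at SIMD width). Every pass emits its
// results in sorted order while ping-ponging between the main and
// scratch buffers, so no bit-reversal pass is needed at the end.
inline void stockham_fft(std::vector<std::complex<double>>& x)
{
    const double pi = 3.141592653589793238462643383279502884;
    const std::size_t n = x.size(); // assumed to be a power of two
    std::vector<std::complex<double>> scratch(n);
    auto* src = &x;
    auto* dst = &scratch;
    // l: half of the current sub-transform length, m: output stride
    for (std::size_t l = n / 2, m = 1; l >= 1; l /= 2, m *= 2)
    {
        for (std::size_t j = 0; j < l; ++j)
        {
            // twiddle w = e^(-i*pi*j/l)
            const std::complex<double> w = std::polar(1.0, -pi * double(j) / double(l));
            for (std::size_t k = 0; k < m; ++k)
            {
                const std::complex<double> a = (*src)[k + j * m];
                const std::complex<double> b = (*src)[k + j * m + l * m];
                (*dst)[k + 2 * j * m]     = a + b;
                (*dst)[k + 2 * j * m + m] = (a - b) * w;
            }
        }
        std::swap(src, dst); // ping-pong, as the out/scratch chains above do
    }
    if (src != &x)
        x = *src; // odd number of passes: result ended in the scratch buffer
}

The price is the extra buffer (each autosort specialization above reserves temp_size = N * sizeof(complex<T>)); the gain is sequential, reorder-free output.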
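On the planner: in the use_autosort branch of make_fft above, one radix-4 pass is peeled per recursion step while the sub-size is at least 16, and the remainder gets a single small final stage (the stage_size == 8 branch selects what appears to be a radix-8 final-stage template, judging from the differing cbool arguments). A toy trace of that schedule (hypothetical standalone snippet, not the planner itself):

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t stage_size = 4096; // e.g. a 2^12 plan
    while (stage_size >= 16)
    {
        std::printf("radix-4 autosort pass over %zu\n", stage_size);
        stage_size /= 4; // one radix-4 pass peeled per recursion step
    }
    // for power-of-two sizes the remainder here is 4 or 8; the
    // stage_size == 8 case above picks a different final-stage template
    std::printf("final autosort stage of size %zu\n", stage_size);
    return 0;
}

For 4096 this prints passes over 4096, 1024, 256, 64 and 16, then a final stage of size 4, mirroring the recursion in make_fft.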
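On cmul_by_twiddle: the if constexpr chain above handles twiddle angles that are multiples of pi/4 without a general complex multiply. For kk == size/8 the twiddle is w = e^(-i*pi/4) = (1 - i)/sqrt(2), so (a + b*i)*w = ((a + b) + (b - a)*i)/sqrt(2): one add/sub pair plus a scale, which is what swap<2>(subadd(swap<2>(x), x)) * isqrt2 evaluates on interleaved re/im lanes. A scalar sanity check of that identity (standalone snippet, not KFR code):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const double isqrt2 = 0.70710678118654752440084436210485; // 1/sqrt(2)
    const std::complex<double> x(3.0, 4.0); // arbitrary a + b*i
    // reference: full complex multiply by w = e^(-i*pi/4)
    const std::complex<double> ref = x * std::polar(1.0, -std::acos(-1.0) / 4);
    // special-cased form: ((a + b) + (b - a)*i) * 1/sqrt(2)
    const std::complex<double> fast((x.real() + x.imag()) * isqrt2,
                                    (x.imag() - x.real()) * isqrt2);
    assert(std::abs(ref - fast) < 1e-12);
    return 0;
}

The other branches are even cheaper: multiples of pi/2 reduce to lane swaps and sign flips (negodd, unary minus), with no arithmetic on magnitudes at all.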
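On digitreverse_impl: the reversal is composed from masked swap steps, and the new bits > 16 / bits > 8 branches exist because a short index does not need the full five-step 32-bit ladder; reversing within 8 or 16 bits and shifting by the correspondingly smaller amount drops the redundant steps. The classic three-step 8-bit reversal shows the technique (a sketch, not the KFR template machinery):

#include <cstdint>

// One masked swap step: exchange the bit groups selected by `mask`
// with the groups `shift` positions above them.
constexpr uint32_t bit_permute_step(uint32_t x, uint32_t mask, uint32_t shift)
{
    return ((x & mask) << shift) | ((x >> shift) & mask);
}

// Reverse the low 8 bits: swap adjacent bits, then pairs, then nibbles.
// A full 32-bit reversal would continue with (0x00ff00ff, 8), (0x0000ffff, 16).
constexpr uint32_t bitreverse8(uint32_t x)
{
    x = bit_permute_step(x, 0x55, 1); // 0b01010101: single bits
    x = bit_permute_step(x, 0x33, 2); // 0b00110011: bit pairs
    x = bit_permute_step(x, 0x0f, 4); // 0b00001111: nibbles
    return x;
}

static_assert(bitreverse8(0b00000001) == 0b10000000);
static_assert(bitreverse8(0b11001010) == 0b01010011);

Starting the ladder at the 0x33/2 step instead of 0x55/1 reverses base-4 digits rather than bits, which is the csize_t<4> flavour consumed by the radix-4 stages.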
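On the fft_real_not_size_4N expectations: the literal can be checked by hand. For the ramp x_n = n with N = 6, X_0 = sum n = 15, and for 0 < k the identity sum n*z^n = N/(z - 1) with z = e^(-2*pi*i*k/N) (using z^N = 1) gives -3 + 3*sqrt(3)*i, -3 + sqrt(3)*i and -3 for k = 1, 2, 3, i.e. exactly { 15.0, { -3, 5.19615242 }, { -3, 1.73205081 }, -3.0 }. The same numbers from a naive O(N^2) DFT (hypothetical check, not part of the test suite):

#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
    const double pi = std::acos(-1.0);
    const int N = 6;
    // naive DFT of x_n = n, keeping bins 0..N/2 as realdft does
    for (int k = 0; k <= N / 2; ++k)
    {
        std::complex<double> X = 0.0;
        for (int n = 0; n < N; ++n)
            X += double(n) * std::polar(1.0, -2.0 * pi * k * n / N);
        std::printf("X[%d] = (%.8f, %.8f)\n", k, X.real(), X.imag());
    }
    // prints (up to rounding): (15, 0), (-3, 5.19615242), (-3, 1.73205081), (-3, 0)
    return 0;
}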