kfr

Fast, modern C++ DSP framework: FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 03ef5fea15070ffca1574e7399086bc11d650002
parent 976e9ecbc82b42d5ba6b51cbbce42df394e60a11
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 16 Apr 2019 15:00:37 +0000

DFT refactoring

Diffstat:
M CMakeLists.txt                             |   7 +++++++
A dft/CMakeLists.txt                         |  32 ++++++++++++++++++++++++++++++++
M include/kfr/cident.h                       |  68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/dft/dft_c.h                    |  12 ++++++------
M include/kfr/dft/fft.hpp                    | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M include/kfr/dft/impl/convolution-impl.cpp  |  14 +++++++-------
M include/kfr/dft/impl/dft-fft.hpp           |  75 ++++++++++++++-------------------------------------------------------------
M include/kfr/dft/impl/dft-impl.hpp          |  37 ++++++++++++++++++-------------------
M include/kfr/dft/impl/dft-src.cpp           |  28 +++++++++++++---------------
M include/kfr/dft/impl/dft-templates.hpp     |   2 +-
M include/kfr/dft/impl/fft-impl.hpp          | 246 +++++++++++++++++++++++++++++++++++--------------------------------------------
M include/kfr/dft/impl/fft-templates.hpp     |  15 ++-------------
12 files changed, 465 insertions(+), 307 deletions(-)

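The headline interface change in this commit is that the pack format of a real DFT is now a property of the plan rather than of each call: dft_plan_real gains a dft_pack_format constructor argument (defaulting to CCs), the execute() overloads that took a dft_pack_format are deleted, and a dedicated dft_stage_real_repack stage performs the packing. A minimal usage sketch of the updated C++ API follows; the constructor and execute() signatures are taken from the fft.hpp diff below, while the umbrella header paths and the example sizes are assumptions.

#include <kfr/base.hpp> // univector, complex, u8 (assumed umbrella header)
#include <kfr/dft.hpp>  // dft_plan_real, dft_pack_format (assumed umbrella header)

using namespace kfr;

int main()
{
    const size_t n = 1024;
    univector<float> in(n);                   // real input signal
    univector<complex<float>> out(n / 2 + 1); // CCs packing: N/2+1 complex bins

    // New in this commit: the pack format is fixed when the plan is constructed...
    dft_plan_real<float> plan(n, dft_pack_format::CCs);
    univector<u8> temp(plan.temp_size);

    // ...so execute() no longer takes a dft_pack_format argument.
    plan.execute(out, in, temp);
    return 0;
}
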
diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -58,6 +58,9 @@ include(sources.cmake) option(ENABLE_TESTS "Enable tests and examples" OFF) if (CLANG) option(ENABLE_DFT "Enable DFT and related algorithms. Requires Clang" ON) + if (X86) + option(ENABLE_DFT_MULTIARCH "Build DFT static libraries for various architectures. Requires Clang" OFF) + endif () endif () option(ENABLE_ASMTEST "Enable writing disassembly" OFF) option(REGENERATE_TESTS "Regenerate auto tests" OFF) @@ -176,6 +179,10 @@ if (ENABLE_DFT) else() target_compile_options(kfr_dft PRIVATE -ffast-math) endif() + + if (ENABLE_DFT_MULTIARCH) + add_subdirectory(dft) + endif () endif() add_library(kfr_io ${KFR_IO_SRC}) diff --git a/dft/CMakeLists.txt b/dft/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (C) 2016 D Levin (http://www.kfrlib.com) +# This file is part of KFR +# +# KFR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# KFR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with KFR. + + +cmake_minimum_required(VERSION 3.1) + +function(add_dft_library ARCH) + add_library(kfr_dft_${ARCH} STATIC ${KFR_DFT_SRC}) + target_link_libraries(kfr_dft_${ARCH} kfr) + target_set_arch(kfr_dft_${ARCH} INTERFACE ${ARCH}) + target_compile_options(kfr_dft_${ARCH} PRIVATE -Xclang -ffast-math) +endfunction() + +add_dft_library(sse2) +add_dft_library(sse41) +add_dft_library(avx) +add_dft_library(avx2) +add_dft_library(avx512) +\ No newline at end of file diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -160,6 +160,74 @@ extern char* gets(char* __s); #define CMT_ARCH_NAME generic #endif +#define CMT_ARCH_ID_GENERIC 0 + +#define CMT_ARCH_ID_SSE2 1 +#define CMT_ARCH_ID_SSE3 2 +#define CMT_ARCH_ID_SSSE3 3 +#define CMT_ARCH_ID_SSE41 4 +#define CMT_ARCH_ID_SSE42 5 +#define CMT_ARCH_ID_AVX 6 +#define CMT_ARCH_ID_AVX2 7 +#define CMT_ARCH_ID_AVX512 8 + +#define CMT_ARCH_ID_NEON 1 +#define CMT_ARCH_ID_NEON64 2 + +#ifdef CMT_ENABLE_SSE2 +#define CMT_EXPAND_IF_ARCH_sse2(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse2(...) +#endif + +#ifdef CMT_ENABLE_SSE3 +#define CMT_EXPAND_IF_ARCH_sse3(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse3(...) +#endif + +#ifdef CMT_ENABLE_SSSE3 +#define CMT_EXPAND_IF_ARCH_ssse3(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_ssse3(...) +#endif + +#ifdef CMT_ENABLE_SSE41 +#define CMT_EXPAND_IF_ARCH_sse41(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse41(...) +#endif + +#ifdef CMT_ENABLE_SSE41 +#define CMT_EXPAND_IF_ARCH_sse41(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse41(...) +#endif + +#ifdef CMT_ENABLE_SSE42 +#define CMT_EXPAND_IF_ARCH_sse42(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse42(...) +#endif + +#ifdef CMT_ENABLE_AVX +#define CMT_EXPAND_IF_ARCH_avx(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_avx(...) +#endif + +#ifdef CMT_ENABLE_AVX2 +#define CMT_EXPAND_IF_ARCH_avx2(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_avx2(...) +#endif + +#ifdef CMT_ENABLE_AVX512 +#define CMT_EXPAND_IF_ARCH_avx512(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_avx512(...) 
+#endif + #ifndef CMT_NO_NATIVE_F64 #define CMT_NATIVE_F64 1 #endif diff --git a/include/kfr/dft/dft_c.h b/include/kfr/dft/dft_c.h @@ -82,18 +82,18 @@ extern "C" // Real DFT plans - KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size); - KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size); + KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size, KFR_DFT_PACK_FORMAT pack_format); + KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size, KFR_DFT_PACK_FORMAT pack_format); void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, const float* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + uint8_t* temp); void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, const double* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + uint8_t* temp); void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, - const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + const float* in, uint8_t* temp); void kfr_dft_execute_real_inverse_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, - const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + const double* in, uint8_t* temp); void kfr_dft_delete_real_plan_f32(KFR_DFT_REAL_PLAN_F32* plan); void kfr_dft_delete_real_plan_f64(KFR_DFT_REAL_PLAN_F64* plan); diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -47,6 +47,53 @@ CMT_PRAGMA_MSVC(warning(disable : 4100)) namespace kfr { +using cdirect_t = cfalse_t; +using cinvert_t = ctrue_t; + +template <typename T> +struct dft_stage +{ + size_t radix = 0; + size_t stage_size = 0; + size_t data_size = 0; + size_t temp_size = 0; + u8* data = nullptr; + size_t repeats = 1; + size_t out_offset = 0; + size_t blocks = 0; + size_t user = 0; + const char* name = nullptr; + bool recursion = false; + bool can_inplace = true; + bool inplace = false; + bool to_scratch = false; + bool need_reorder = true; + + void initialize(size_t size) { do_initialize(size); } + + virtual void dump() const + { + printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix, + stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, + to_scratch); + } + + KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cdirect_t(), out, in, temp); + } + KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cinvert_t(), out, in, temp); + } + virtual ~dft_stage() {} + +protected: + virtual void do_initialize(size_t) {} + virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; + virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; +}; + enum class dft_type { both, @@ -60,24 +107,52 @@ enum class dft_order internal, // possibly bit/digit-reversed, implementation-defined, faster to compute }; -inline namespace CMT_ARCH_NAME +enum class dft_pack_format { + Perm, // {X[0].r, X[N].r}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i} + CCs // {X[0].r, 0}, ... {X[i].r, X[i].i}, ... 
{X[N-1].r, X[N-1].i}, {X[N].r, 0} +}; + +template <typename T> +struct dft_plan; + +template <typename T> +struct dft_plan_real; template <typename T> struct dft_stage; +template <typename T> +using dft_stage_ptr = std::unique_ptr<dft_stage<T>>; + +inline namespace CMT_ARCH_NAME +{ +template <typename T> +void dft_initialize(dft_plan<T>& plan); +template <typename T> +void dft_real_initialize(dft_plan_real<T>& plan); +} // namespace CMT_ARCH_NAME + /// @brief Class for performing DFT/FFT template <typename T> struct dft_plan { - using dft_stage_ptr = std::unique_ptr<dft_stage<T>>; - size_t size; size_t temp_size; - dft_plan(size_t size, dft_order order = dft_order::normal); + explicit dft_plan(size_t size, dft_order order = dft_order::normal) + : size(size), temp_size(0), data_size(0) + { + dft_initialize(*this); + } - void dump() const; + void dump() const + { + for (const std::unique_ptr<dft_stage<T>>& s : stages) + { + s->dump(); + } + } KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const @@ -87,7 +162,7 @@ struct dft_plan else execute_dft(cfalse, out, in, temp); } - ~dft_plan(); + ~dft_plan() {} template <bool inverse> KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp, cbool_t<inverse> inv) const @@ -111,44 +186,102 @@ struct dft_plan execute_dft(inv, out.data(), in.data(), temp.data()); } -protected: autofree<u8> data; size_t data_size; - std::vector<dft_stage_ptr> stages; - bool need_reorder; - - template <typename Stage, typename... Args> - void add_stage(Args... args); - - template <bool is_final> - void prepare_dft_stage(size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>); - - template <bool is_even, bool first> - void make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>); - - void initialize(); - template <bool inverse> - void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const; + std::vector<dft_stage_ptr<T>> stages; +protected: + struct noinit + { + }; + explicit dft_plan(noinit, size_t size, dft_order order = dft_order::normal) + : size(size), temp_size(0), data_size(0) + { + } const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch, bool in_scratch) const; - complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const; - - void init_dft(size_t size, dft_order order); - void init_fft(size_t size, dft_order order); -}; + const complex<T>* scratch, bool in_scratch) const + { + if (stage == 0) + return in_scratch ? scratch : in; + return stages[stage - 1]->to_scratch ? scratch : out; + } + complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const + { + return stages[stage]->to_scratch ? scratch : out; + } -enum class dft_pack_format -{ - Perm, // {X[0].r, X[N].r}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i} - CCs // {X[0].r, 0}, ... {X[i].r, X[i].i}, ... 
{X[N-1].r, X[N-1].i}, {X[N].r, 0} + template <bool inverse> + void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const + { + if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) + { + return stages[0]->execute(cbool<inverse>, out, in, temp); + } + size_t stack[32] = { 0 }; + + complex<T>* scratch = ptr_cast<complex<T>>( + temp + this->temp_size - + align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); + + bool in_scratch = !stages[0]->can_inplace && in == out; + if (in_scratch) + { + builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); + } + + const size_t count = stages.size(); + + for (size_t depth = 0; depth < count;) + { + if (stages[depth]->recursion) + { + size_t offset = 0; + size_t rdepth = depth; + size_t maxdepth = depth; + do + { + if (stack[rdepth] == stages[rdepth]->repeats) + { + stack[rdepth] = 0; + rdepth--; + } + else + { + complex<T>* rout = select_out(rdepth, out, scratch); + const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); + stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); + offset += stages[rdepth]->out_offset; + stack[rdepth]++; + if (rdepth < count - 1 && stages[rdepth + 1]->recursion) + rdepth++; + else + maxdepth = rdepth; + } + } while (rdepth != depth); + depth = maxdepth + 1; + } + else + { + stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), + select_in(depth, out, in, scratch, in_scratch), temp); + depth++; + } + } + } }; template <typename T> struct dft_plan_real : dft_plan<T> { size_t size; - dft_plan_real(size_t size); + dft_pack_format fmt; + dft_stage_ptr<T> fmt_stage; + + explicit dft_plan_real(size_t size, dft_pack_format fmt = dft_pack_format::CCs) + : dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt) + { + dft_real_initialize(*this); + } void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete; @@ -163,45 +296,50 @@ struct dft_plan_real : dft_plan<T> void execute(univector<complex<T>, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&, cbool_t<inverse>) const = delete; - KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp) const { this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp); - to_fmt(out, fmt); + fmt_stage->execute(cfalse, out, out, nullptr); } - KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp) const { complex<T>* outdata = ptr_cast<complex<T>>(out); - from_fmt(outdata, in, fmt); + fmt_stage->execute(cfalse, outdata, in, nullptr); this->execute_dft(ctrue, outdata, outdata, temp); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in, - univector<u8, Tag3>& temp, - dft_pack_format fmt = dft_pack_format::CCs) const + univector<u8, Tag3>& temp) const { this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data()); - to_fmt(out.data(), fmt); + fmt_stage->execute(cfalse, out.data(), out.data(), nullptr); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, - 
dft_pack_format fmt = dft_pack_format::CCs) const + univector<u8, Tag3>& temp) const { complex<T>* outdata = ptr_cast<complex<T>>(out.data()); - from_fmt(outdata, in.data(), fmt); + fmt_stage->execute(ctrue, outdata, in.data(), nullptr); this->execute_dft(ctrue, outdata, outdata, temp.data()); } -private: - univector<complex<T>> rtwiddle; + // Deprecated. fmt must be passed to constructor instead + void execute(complex<T>*, const T*, u8*, dft_pack_format) const = delete; + void execute(T*, const complex<T>*, u8*, dft_pack_format) const = delete; - void to_fmt(complex<T>* out, dft_pack_format fmt) const; - void from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const; + // Deprecated. fmt must be passed to constructor instead + template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> + void execute(univector<complex<T>, Tag1>&, const univector<T, Tag2>&, univector<u8, Tag3>&, + dft_pack_format) const = delete; + template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> + void execute(univector<T, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&, + dft_pack_format) const = delete; }; +inline namespace CMT_ARCH_NAME +{ + template <typename T, univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> void fft_multiply(univector<complex<T>, Tag1>& dest, const univector<complex<T>, Tag2>& src1, const univector<complex<T>, Tag3>& src2, dft_pack_format fmt = dft_pack_format::CCs) diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp @@ -82,16 +82,16 @@ univector<T> autocorrelate(const univector_ref<const T>& src1) template <typename T> convolve_filter<T>::convolve_filter(size_t size, size_t block_size) - : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size), - segments((size + block_size - 1) / block_size) + : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm), + temp(fft.temp_size), segments((size + block_size - 1) / block_size) { } template <typename T> convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size) - : size(data.size()), block_size(next_poweroftwo(block_size)), fft(2 * next_poweroftwo(block_size)), - temp(fft.temp_size), + : size(data.size()), block_size(next_poweroftwo(block_size)), + fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm), temp(fft.temp_size), segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), input_position(0), position(0) @@ -110,7 +110,7 @@ void convolve_filter<T>::set_data(const univector<T>& data) ir_segments[i].resize(block_size, 0); input = padded(data.slice(i * block_size, block_size)); - fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm); + fft.execute(ir_segments[i], input, temp); process(ir_segments[i], ir_segments[i] * ifftsize); } saved_input.resize(block_size, 0); @@ -130,7 +130,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T)); process(scratch, padded(saved_input)); - fft.execute(segments[position], scratch, temp, dft_pack_format::Perm); + fft.execute(segments[position], scratch, temp); if (input_position == 0) { @@ -143,7 +143,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) } fft_multiply_accumulate(cscratch, premul, 
ir_segments[0], segments[position], dft_pack_format::Perm); - fft.execute(scratch, cscratch, temp, dft_pack_format::Perm); + fft.execute(scratch, cscratch, temp); process(make_univector(output + processed, processing), scratch.slice(input_position) + overlap.slice(input_position)); diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp @@ -38,60 +38,6 @@ namespace kfr { -inline namespace CMT_ARCH_NAME -{ - -#define DFT_ASSERT TESTO_ASSERT_INACTIVE - -template <typename T> -constexpr size_t fft_vector_width = vector_width<T>; - -using cdirect_t = cfalse_t; -using cinvert_t = ctrue_t; - -template <typename T> -struct dft_stage -{ - size_t radix = 0; - size_t stage_size = 0; - size_t data_size = 0; - size_t temp_size = 0; - u8* data = nullptr; - size_t repeats = 1; - size_t out_offset = 0; - size_t blocks = 0; - const char* name = nullptr; - bool recursion = false; - bool can_inplace = true; - bool inplace = false; - bool to_scratch = false; - bool need_reorder = true; - - void initialize(size_t size) { do_initialize(size); } - - virtual void dump() const - { - printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix, - stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, - to_scratch); - } - - KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cdirect_t(), out, in, temp); - } - KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cinvert_t(), out, in, temp); - } - virtual ~dft_stage() {} - -protected: - virtual void do_initialize(size_t) {} - virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; - virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; -}; - #define DFT_STAGE_FN \ void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ { \ @@ -102,20 +48,27 @@ protected: return do_execute<true>(out, in, temp); \ } +inline namespace CMT_ARCH_NAME +{ + +#define DFT_ASSERT TESTO_ASSERT_INACTIVE + +template <typename T> +constexpr size_t fft_vector_width = vector_width<T>; + CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wassume") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") #endif -template <typename T> -template <typename Stage, typename... Args> -void dft_plan<T>::add_stage(Args... args) +template <typename Stage, typename T, typename... Args> +void add_stage(dft_plan<T>* self, Args... args) { dft_stage<T>* stage = new Stage(args...); - stage->need_reorder = need_reorder; - this->data_size += stage->data_size; - this->temp_size += stage->temp_size; - stages.push_back(dft_stage_ptr(stage)); + stage->need_reorder = true; + self->data_size += stage->data_size; + self->temp_size += stage->temp_size; + self->stages.push_back(dft_stage_ptr<T>(stage)); } } // namespace CMT_ARCH_NAME diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp @@ -166,7 +166,7 @@ struct fft_inverse : internal::expression_with_arguments<E> } friend KFR_INTRINSIC vec<value_type, 1> get_elements(const fft_inverse& self, cinput_t input, - size_t index, vec_shape<value_type, 1>) + size_t index, vec_shape<value_type, 1>) { return self.argument_first(input, index == 0 ? 
0 : self.size() - index, vec_shape<value_type, 1>()); } @@ -459,32 +459,31 @@ protected: }; } // namespace intrinsics -template <typename T> -template <bool is_final> -void dft_plan<T>::prepare_dft_stage(size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>) +template <bool is_final, typename T> +void prepare_dft_stage(dft_plan<T>* self, size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>) { return cswitch( dft_radices, radix, - [this, iterations, blocks](auto radix) CMT_INLINE_LAMBDA { + [self, iterations, blocks](auto radix) CMT_INLINE_LAMBDA { add_stage<conditional<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>, - intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations, - blocks); + intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(self, radix, + iterations, blocks); }, - [this, radix, iterations, blocks]() { - add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); + [self, radix, iterations, blocks]() { + add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(self, radix, iterations, blocks); }); } template <typename T> -void dft_plan<T>::init_dft(size_t size, dft_order) +void init_dft(dft_plan<T>* self, size_t size, dft_order) { if (size == 60) { - this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(); + add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(self); } else if (size == 48) { - this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(); + add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(self); } else { @@ -504,7 +503,7 @@ void dft_plan<T>::init_dft(size_t size, dft_order) if (cur_size >= 101) { - this->add_stage<intrinsics::dft_arblen_stage_impl<T>>(size); + add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size); } else { @@ -518,9 +517,9 @@ void dft_plan<T>::init_dft(size_t size, dft_order) iterations /= r; radices[radices_size++] = r; if (iterations == 1) - this->prepare_dft_stage(r, iterations, blocks, ctrue); + prepare_dft_stage(self, r, iterations, blocks, ctrue); else - this->prepare_dft_stage(r, iterations, blocks, cfalse); + prepare_dft_stage(self, r, iterations, blocks, cfalse); blocks *= r; } } @@ -530,13 +529,13 @@ void dft_plan<T>::init_dft(size_t size, dft_order) iterations /= cur_size; radices[radices_size++] = cur_size; if (iterations == 1) - this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + prepare_dft_stage(self, cur_size, iterations, blocks, ctrue); else - this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + prepare_dft_stage(self, cur_size, iterations, blocks, cfalse); } - if (stages.size() > 2) - this->add_stage<intrinsics::dft_reorder_stage_impl<T>>(radices, radices_size); + if (self->stages.size() > 2) + add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size); } } } diff --git a/include/kfr/dft/impl/dft-src.cpp b/include/kfr/dft/impl/dft-src.cpp @@ -80,42 +80,40 @@ extern "C" // Real DFT plans - KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size) + KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size, KFR_DFT_PACK_FORMAT pack_format) { - return reinterpret_cast<KFR_DFT_REAL_PLAN_F32*>(new kfr::dft_plan_real<float>(size)); + return reinterpret_cast<KFR_DFT_REAL_PLAN_F32*>( + new kfr::dft_plan_real<float>(size, static_cast<dft_pack_format>(pack_format))); } - KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size) + KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size, KFR_DFT_PACK_FORMAT pack_format) { - return 
reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(new kfr::dft_plan_real<double>(size)); + return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>( + new kfr::dft_plan_real<double>(size, static_cast<dft_pack_format>(pack_format))); } void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + uint8_t* temp) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( - reinterpret_cast<kfr::complex<float>*>(out), in, temp, - static_cast<kfr::dft_pack_format>(pack_format)); + reinterpret_cast<kfr::complex<float>*>(out), in, temp); } void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + uint8_t* temp) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( - reinterpret_cast<kfr::complex<double>*>(out), in, temp, - static_cast<kfr::dft_pack_format>(pack_format)); + reinterpret_cast<kfr::complex<double>*>(out), in, temp); } void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + uint8_t* temp) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( - out, reinterpret_cast<const kfr::complex<float>*>(in), temp, - static_cast<kfr::dft_pack_format>(pack_format)); + out, reinterpret_cast<const kfr::complex<float>*>(in), temp); } void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( - out, reinterpret_cast<const kfr::complex<double>*>(in), temp, - static_cast<kfr::dft_pack_format>(pack_format)); + out, reinterpret_cast<const kfr::complex<double>*>(in), temp); } void kfr_dft_delete_real_plan_f32(KFR_DFT_REAL_PLAN_F32* plan) diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp @@ -33,7 +33,7 @@ inline namespace CMT_ARCH_NAME { #ifndef KFR_DFT_NO_NPo2 -template void dft_plan<FLOAT>::init_dft(size_t, dft_order); +template void init_dft(dft_plan<FLOAT>*, size_t, dft_order); #endif } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp @@ -559,20 +559,18 @@ struct fft_reorder_stage_impl : dft_stage<T> { this->name = type_name<decltype(*this)>(); this->stage_size = stage_size; - log2n = ilog2(stage_size); + this->user = ilog2(stage_size); this->data_size = 0; } protected: - size_t log2n; - virtual void do_initialize(size_t) override final {} DFT_STAGE_FN template <bool inverse> KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*) { - fft_reorder(out, log2n, cbool_t<!is_even>()); + fft_reorder(out, this->user, cbool_t<!is_even>()); } }; @@ -841,21 +839,20 @@ struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> } // namespace intrinsics -template <typename T> -template <bool is_even, bool first> -void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>) +template <bool is_even, bool first, typename T> +KFR_INTRINSIC void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>, cbool_t<first>) { constexpr size_t final_size = is_even ? 
1024 : 512; if (stage_size >= 2048) { - add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(stage_size); + add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size); - make_fft(stage_size / 4, cbool_t<is_even>(), cfalse); + make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); } else { - add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(final_size); + add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size); } } @@ -866,38 +863,49 @@ struct reverse_wrapper }; template <typename T> -auto begin(reverse_wrapper<T> w) +KFR_INTRINSIC auto begin(reverse_wrapper<T> w) { return std::rbegin(w.iterable); } template <typename T> -auto end(reverse_wrapper<T> w) +KFR_INTRINSIC auto end(reverse_wrapper<T> w) { return std::rend(w.iterable); } template <typename T> -reverse_wrapper<T> reversed(T&& iterable) +KFR_INTRINSIC reverse_wrapper<T> reversed(T&& iterable) { return { iterable }; } template <typename T> -void dft_plan<T>::initialize() +KFR_INTRINSIC void initialize_data_stage(dft_plan<T>* self, const dft_stage_ptr<T>& stage, size_t& offset) +{ + stage->data = self->data.data() + offset; + stage->initialize(self->size); + offset += stage->data_size; +} + +template <typename T> +KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self) { - data = autofree<u8>(data_size); + self->data = autofree<u8>(self->data_size); size_t offset = 0; - for (dft_stage_ptr& stage : stages) + for (dft_stage_ptr<T>& stage : self->stages) { - stage->data = data.data() + offset; - stage->initialize(this->size); - offset += stage->data_size; + initialize_data_stage(self, stage, offset); } + return offset; +} +template <typename T> +KFR_INTRINSIC void initialize_order(dft_plan<T>* self) +{ bool to_scratch = false; bool scratch_needed = false; - for (dft_stage_ptr& stage : reversed(stages)) + for (dft_stage_ptr<T>& stage : reversed(self->stages)) { if (to_scratch) { @@ -909,141 +917,47 @@ void dft_plan<T>::initialize() to_scratch = !to_scratch; } } - if (scratch_needed || !stages[0]->can_inplace) - this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); -} - -template <typename T> -const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch, bool in_scratch) const -{ - if (stage == 0) - return in_scratch ? scratch : in; - return stages[stage - 1]->to_scratch ? scratch : out; -} - -template <typename T> -complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const -{ - return stages[stage]->to_scratch ? 
scratch : out; -} - -template <typename T> -template <bool inverse> -void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const -{ - if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) - { - return stages[0]->execute(cbool<inverse>, out, in, temp); - } - size_t stack[32] = { 0 }; - - complex<T>* scratch = - ptr_cast<complex<T>>(temp + this->temp_size - - align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); - - bool in_scratch = !stages[0]->can_inplace && in == out; - if (in_scratch) - { - builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); - } - - const size_t count = stages.size(); - - for (size_t depth = 0; depth < count;) - { - if (stages[depth]->recursion) - { - size_t offset = 0; - size_t rdepth = depth; - size_t maxdepth = depth; - do - { - if (stack[rdepth] == stages[rdepth]->repeats) - { - stack[rdepth] = 0; - rdepth--; - } - else - { - complex<T>* rout = select_out(rdepth, out, scratch); - const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); - stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); - offset += stages[rdepth]->out_offset; - stack[rdepth]++; - if (rdepth < count - 1 && stages[rdepth + 1]->recursion) - rdepth++; - else - maxdepth = rdepth; - } - } while (rdepth != depth); - depth = maxdepth + 1; - } - else - { - stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), - select_in(depth, out, in, scratch, in_scratch), temp); - depth++; - } - } + if (scratch_needed || !self->stages[0]->can_inplace) + self->temp_size += align_up(sizeof(complex<T>) * self->size, platform<>::native_cache_alignment); } template <typename T> -void dft_plan<T>::init_fft(size_t size, dft_order) +KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order) { const size_t log2n = ilog2(size); cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n, [&](auto log2n) { (void)log2n; constexpr size_t log2nv = val_of(decltype(log2n)()); - this->add_stage<intrinsics::fft_specialization<T, log2nv>>(size); + add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size); }, [&]() { cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { - this->make_fft(size, is_even, ctrue); + make_fft(self, size, is_even, ctrue); constexpr size_t is_evenv = val_of(decltype(is_even)()); - if (need_reorder) - this->add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(size); + add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(self, size); }); }); } template <typename T> -dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0) -{ - need_reorder = true; - if (is_poweroftwo(size)) - { - init_fft(size, order); - } -#ifndef KFR_DFT_NO_NPo2 - else - { - init_dft(size, order); - } -#endif - initialize(); -} - -template <typename T> -dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) +KFR_INTRINSIC void generate_real_twiddles(dft_plan_real<T>* self, size_t size) { using namespace intrinsics; - constexpr size_t width = vector_width<T> * 2; - block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); - cwrite<width>(rtwiddle.data() + i, + cwrite<width>(self->rtwiddle.data() + i, cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); }); } template <typename T> -void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const +KFR_INTRINSIC void to_fmt(size_t real_size, 
const complex<T>* rtwiddle, complex<T>* out, const complex<T>* in, + dft_pack_format fmt) { using namespace intrinsics; - size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 + size_t csize = real_size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 constexpr size_t width = vector_width<T> * 2; const cvec<T, 1> dc = cread<1>(out); @@ -1053,9 +967,9 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const i++; constexpr size_t width = val_of(decltype(w)()); constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); - const cvec<T, width> fpk = cread<width>(out + i); - const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1))); + const cvec<T, width> tw = cread<width>(rtwiddle + i); + const cvec<T, width> fpk = cread<width>(in + i); + const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); const cvec<T, width> f1k = fpk + fpnk; const cvec<T, width> f2k = fpk - fpnk; @@ -1066,7 +980,7 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const { size_t k = csize / 2; - const cvec<T, 1> fpk = cread<1>(out + k); + const cvec<T, 1> fpk = cread<1>(in + k); const cvec<T, 1> fpnk = negodd(fpk); cwrite<1>(out + k, fpnk); } @@ -1082,11 +996,12 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const } template <typename T> -void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const +KFR_INTRINSIC void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const complex<T>* in, + dft_pack_format fmt) { using namespace intrinsics; - const size_t csize = this->size / 2; + const size_t csize = real_size / 2; cvec<T, 1> dc; @@ -1106,7 +1021,7 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_ i++; constexpr size_t width = val_of(decltype(w)()); constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); + const cvec<T, width> tw = cread<width>(rtwiddle + i); const cvec<T, width> fpk = cread<width>(in + i); const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); @@ -1127,18 +1042,77 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_ } template <typename T> -dft_plan<T>::~dft_plan() +void init_dft(dft_plan<T>* self, size_t size, dft_order); + +template <typename T> +KFR_INTRINSIC void initialize_stages(dft_plan<T>* self) +{ + if (is_poweroftwo(self->size)) + { + init_fft(self, self->size, dft_order::normal); + } +#ifndef KFR_DFT_NO_NPo2 + else + { + init_dft(self, self->size, dft_order::normal); + } +#endif +} + +template <typename T> +void dft_initialize(dft_plan<T>& plan) { + initialize_stages(&plan); + initialize_data(&plan); + initialize_order(&plan); } template <typename T> -void dft_plan<T>::dump() const +struct dft_stage_real_repack : dft_stage<T> { - for (const dft_stage_ptr& s : stages) +public: + dft_stage_real_repack(size_t real_size, dft_pack_format fmt) + { + this->user = static_cast<int>(fmt); + this->stage_size = real_size; + this->data_size = align_up(sizeof(complex<T>) * (real_size / 4), platform<>::native_cache_alignment); + } + void do_initialize(size_t) override { - s->dump(); + using namespace intrinsics; + constexpr size_t width = vector_width<T> * 2; + const size_t real_size = this->stage_size; + complex<T>* rtwiddle = ptr_cast<complex<T>>(this->data); + 
block_process(real_size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { + constexpr size_t width = val_of(decltype(w)()); + cwrite<width>(rtwiddle + i, + cossin(dup(-constants<T>::pi * + ((enumerate<T, width>() + i + real_size / 4) / (real_size / 2))))); + }); } + void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override + { + to_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in, + static_cast<dft_pack_format>(this->user)); + } + void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override + { + from_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in, + static_cast<dft_pack_format>(this->user)); + } +}; + +template <typename T> +void dft_real_initialize(dft_plan_real<T>& plan) +{ + initialize_stages(&plan); + plan.fmt_stage.reset(new dft_stage_real_repack<T>(plan.size, plan.fmt)); + plan.data_size += plan.fmt_stage->data_size; + size_t offset = initialize_data(&plan); + initialize_data_stage(&plan, plan.fmt_stage, offset); + initialize_order(&plan); } + } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp @@ -31,19 +31,8 @@ namespace kfr { inline namespace CMT_ARCH_NAME { - -template dft_plan<FLOAT>::dft_plan(size_t, dft_order); -template void dft_plan<FLOAT>::init_fft(size_t, dft_order); -template dft_plan<FLOAT>::~dft_plan(); -template void dft_plan<FLOAT>::dump() const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template dft_plan_real<FLOAT>::dft_plan_real(size_t); -template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in, - kfr::dft_pack_format fmt) const; -template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const; +template void dft_initialize<FLOAT>(dft_plan<FLOAT>& plan); +template void dft_real_initialize<FLOAT>(dft_plan_real<FLOAT>& plan); } // namespace CMT_ARCH_NAME } // namespace kfr
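
The C API in dft_c.h moves in the same direction: kfr_dft_create_real_plan_f32/f64 now accept the KFR_DFT_PACK_FORMAT at plan creation, and the kfr_dft_execute_real_* functions drop that parameter. A hedged sketch of the migrated call sequence, assuming the CCs enumerator name and a placeholder temp-buffer size (this diff does not show a C-side query for the required temp size):

#include <kfr/dft/dft_c.h> /* C API header changed in this commit */
#include <stdint.h>
#include <stdlib.h>

/* Forward real FFT of n samples through the refactored C API.
   For CCs packing, out must hold 2*(n/2+1) floats (interleaved re/im). */
void real_fft_c(float* out, const float* in, size_t n)
{
    /* Pack format is now bound to the plan (signature from the dft_c.h diff);
       the enumerator name CCs is an assumption, not shown in this commit. */
    KFR_DFT_REAL_PLAN_F32* plan = kfr_dft_create_real_plan_f32(n, CCs);

    /* Placeholder size: the actual requirement is the plan's temp_size,
       which this diff does not expose through the C interface. */
    uint8_t* temp = (uint8_t*)malloc(16 * n + 1024);

    /* The execute call no longer takes a KFR_DFT_PACK_FORMAT argument. */
    kfr_dft_execute_real_f32(plan, n, out, in, temp);

    free(temp);
    kfr_dft_delete_real_plan_f32(plan);
}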