commit 03ef5fea15070ffca1574e7399086bc11d650002
parent 976e9ecbc82b42d5ba6b51cbbce42df394e60a11
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Tue, 16 Apr 2019 15:00:37 +0000
DFT refactoring
Diffstat:
12 files changed, 465 insertions(+), 307 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -58,6 +58,9 @@ include(sources.cmake)
option(ENABLE_TESTS "Enable tests and examples" OFF)
if (CLANG)
option(ENABLE_DFT "Enable DFT and related algorithms. Requires Clang" ON)
+ if (X86)
+ option(ENABLE_DFT_MULTIARCH "Build DFT static libraries for various architectures. Requires Clang" OFF)
+ endif ()
endif ()
option(ENABLE_ASMTEST "Enable writing disassembly" OFF)
option(REGENERATE_TESTS "Regenerate auto tests" OFF)
@@ -176,6 +179,10 @@ if (ENABLE_DFT)
else()
target_compile_options(kfr_dft PRIVATE -ffast-math)
endif()
+
+ if (ENABLE_DFT_MULTIARCH)
+ add_subdirectory(dft)
+ endif ()
endif()
add_library(kfr_io ${KFR_IO_SRC})
diff --git a/dft/CMakeLists.txt b/dft/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright (C) 2016 D Levin (http://www.kfrlib.com)
+# This file is part of KFR
+#
+# KFR is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# KFR is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with KFR.
+
+
+cmake_minimum_required(VERSION 3.1)
+
+function(add_dft_library ARCH)
+ add_library(kfr_dft_${ARCH} STATIC ${KFR_DFT_SRC})
+ target_link_libraries(kfr_dft_${ARCH} kfr)
+ target_set_arch(kfr_dft_${ARCH} INTERFACE ${ARCH})
+ target_compile_options(kfr_dft_${ARCH} PRIVATE -Xclang -ffast-math)
+endfunction()
+
+add_dft_library(sse2)
+add_dft_library(sse41)
+add_dft_library(avx)
+add_dft_library(avx2)
+add_dft_library(avx512)
+\ No newline at end of file
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -160,6 +160,74 @@ extern char* gets(char* __s);
#define CMT_ARCH_NAME generic
#endif
+#define CMT_ARCH_ID_GENERIC 0
+
+#define CMT_ARCH_ID_SSE2 1
+#define CMT_ARCH_ID_SSE3 2
+#define CMT_ARCH_ID_SSSE3 3
+#define CMT_ARCH_ID_SSE41 4
+#define CMT_ARCH_ID_SSE42 5
+#define CMT_ARCH_ID_AVX 6
+#define CMT_ARCH_ID_AVX2 7
+#define CMT_ARCH_ID_AVX512 8
+
+#define CMT_ARCH_ID_NEON 1
+#define CMT_ARCH_ID_NEON64 2
+
+#ifdef CMT_ENABLE_SSE2
+#define CMT_EXPAND_IF_ARCH_sse2(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_sse2(...)
+#endif
+
+#ifdef CMT_ENABLE_SSE3
+#define CMT_EXPAND_IF_ARCH_sse3(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_sse3(...)
+#endif
+
+#ifdef CMT_ENABLE_SSSE3
+#define CMT_EXPAND_IF_ARCH_ssse3(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_ssse3(...)
+#endif
+
+#ifdef CMT_ENABLE_SSE41
+#define CMT_EXPAND_IF_ARCH_sse41(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_sse41(...)
+#endif
+
+#ifdef CMT_ENABLE_SSE41
+#define CMT_EXPAND_IF_ARCH_sse41(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_sse41(...)
+#endif
+
+#ifdef CMT_ENABLE_SSE42
+#define CMT_EXPAND_IF_ARCH_sse42(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_sse42(...)
+#endif
+
+#ifdef CMT_ENABLE_AVX
+#define CMT_EXPAND_IF_ARCH_avx(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_avx(...)
+#endif
+
+#ifdef CMT_ENABLE_AVX2
+#define CMT_EXPAND_IF_ARCH_avx2(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_avx2(...)
+#endif
+
+#ifdef CMT_ENABLE_AVX512
+#define CMT_EXPAND_IF_ARCH_avx512(...) __VA_ARGS__
+#else
+#define CMT_EXPAND_IF_ARCH_avx512(...)
+#endif
+
#ifndef CMT_NO_NATIVE_F64
#define CMT_NATIVE_F64 1
#endif
diff --git a/include/kfr/dft/dft_c.h b/include/kfr/dft/dft_c.h
@@ -82,18 +82,18 @@ extern "C"
// Real DFT plans
- KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size);
- KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size);
+ KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size, KFR_DFT_PACK_FORMAT pack_format);
+ KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size, KFR_DFT_PACK_FORMAT pack_format);
void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, const float* in,
- uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format);
+ uint8_t* temp);
void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, const double* in,
- uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format);
+ uint8_t* temp);
void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out,
- const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format);
+ const float* in, uint8_t* temp);
void kfr_dft_execute_real_inverse_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out,
- const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format);
+ const double* in, uint8_t* temp);
void kfr_dft_delete_real_plan_f32(KFR_DFT_REAL_PLAN_F32* plan);
void kfr_dft_delete_real_plan_f64(KFR_DFT_REAL_PLAN_F64* plan);
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -47,6 +47,53 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
namespace kfr
{
+using cdirect_t = cfalse_t;
+using cinvert_t = ctrue_t;
+
+template <typename T>
+struct dft_stage
+{
+ size_t radix = 0;
+ size_t stage_size = 0;
+ size_t data_size = 0;
+ size_t temp_size = 0;
+ u8* data = nullptr;
+ size_t repeats = 1;
+ size_t out_offset = 0;
+ size_t blocks = 0;
+ size_t user = 0;
+ const char* name = nullptr;
+ bool recursion = false;
+ bool can_inplace = true;
+ bool inplace = false;
+ bool to_scratch = false;
+ bool need_reorder = true;
+
+ void initialize(size_t size) { do_initialize(size); }
+
+ virtual void dump() const
+ {
+ printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
+ stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
+ to_scratch);
+ }
+
+ KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ do_execute(cdirect_t(), out, in, temp);
+ }
+ KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ do_execute(cinvert_t(), out, in, temp);
+ }
+ virtual ~dft_stage() {}
+
+protected:
+ virtual void do_initialize(size_t) {}
+ virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0;
+ virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0;
+};
+
enum class dft_type
{
both,
@@ -60,24 +107,52 @@ enum class dft_order
internal, // possibly bit/digit-reversed, implementation-defined, faster to compute
};
-inline namespace CMT_ARCH_NAME
+enum class dft_pack_format
{
+ Perm, // {X[0].r, X[N].r}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i}
+ CCs // {X[0].r, 0}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i}, {X[N].r, 0}
+};
+
+template <typename T>
+struct dft_plan;
+
+template <typename T>
+struct dft_plan_real;
template <typename T>
struct dft_stage;
+template <typename T>
+using dft_stage_ptr = std::unique_ptr<dft_stage<T>>;
+
+inline namespace CMT_ARCH_NAME
+{
+template <typename T>
+void dft_initialize(dft_plan<T>& plan);
+template <typename T>
+void dft_real_initialize(dft_plan_real<T>& plan);
+} // namespace CMT_ARCH_NAME
+
/// @brief Class for performing DFT/FFT
template <typename T>
struct dft_plan
{
- using dft_stage_ptr = std::unique_ptr<dft_stage<T>>;
-
size_t size;
size_t temp_size;
- dft_plan(size_t size, dft_order order = dft_order::normal);
+ explicit dft_plan(size_t size, dft_order order = dft_order::normal)
+ : size(size), temp_size(0), data_size(0)
+ {
+ dft_initialize(*this);
+ }
- void dump() const;
+ void dump() const
+ {
+ for (const std::unique_ptr<dft_stage<T>>& s : stages)
+ {
+ s->dump();
+ }
+ }
KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp,
bool inverse = false) const
@@ -87,7 +162,7 @@ struct dft_plan
else
execute_dft(cfalse, out, in, temp);
}
- ~dft_plan();
+ ~dft_plan() {}
template <bool inverse>
KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp,
cbool_t<inverse> inv) const
@@ -111,44 +186,102 @@ struct dft_plan
execute_dft(inv, out.data(), in.data(), temp.data());
}
-protected:
autofree<u8> data;
size_t data_size;
- std::vector<dft_stage_ptr> stages;
- bool need_reorder;
-
- template <typename Stage, typename... Args>
- void add_stage(Args... args);
-
- template <bool is_final>
- void prepare_dft_stage(size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>);
-
- template <bool is_even, bool first>
- void make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>);
-
- void initialize();
- template <bool inverse>
- void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const;
+ std::vector<dft_stage_ptr<T>> stages;
+protected:
+ struct noinit
+ {
+ };
+ explicit dft_plan(noinit, size_t size, dft_order order = dft_order::normal)
+ : size(size), temp_size(0), data_size(0)
+ {
+ }
const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
- const complex<T>* scratch, bool in_scratch) const;
- complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const;
-
- void init_dft(size_t size, dft_order order);
- void init_fft(size_t size, dft_order order);
-};
+ const complex<T>* scratch, bool in_scratch) const
+ {
+ if (stage == 0)
+ return in_scratch ? scratch : in;
+ return stages[stage - 1]->to_scratch ? scratch : out;
+ }
+ complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
+ {
+ return stages[stage]->to_scratch ? scratch : out;
+ }
-enum class dft_pack_format
-{
- Perm, // {X[0].r, X[N].r}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i}
- CCs // {X[0].r, 0}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i}, {X[N].r, 0}
+ template <bool inverse>
+ void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
+ {
+ if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
+ {
+ return stages[0]->execute(cbool<inverse>, out, in, temp);
+ }
+ size_t stack[32] = { 0 };
+
+ complex<T>* scratch = ptr_cast<complex<T>>(
+ temp + this->temp_size -
+ align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
+
+ bool in_scratch = !stages[0]->can_inplace && in == out;
+ if (in_scratch)
+ {
+ builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
+ }
+
+ const size_t count = stages.size();
+
+ for (size_t depth = 0; depth < count;)
+ {
+ if (stages[depth]->recursion)
+ {
+ size_t offset = 0;
+ size_t rdepth = depth;
+ size_t maxdepth = depth;
+ do
+ {
+ if (stack[rdepth] == stages[rdepth]->repeats)
+ {
+ stack[rdepth] = 0;
+ rdepth--;
+ }
+ else
+ {
+ complex<T>* rout = select_out(rdepth, out, scratch);
+ const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
+ stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
+ offset += stages[rdepth]->out_offset;
+ stack[rdepth]++;
+ if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
+ rdepth++;
+ else
+ maxdepth = rdepth;
+ }
+ } while (rdepth != depth);
+ depth = maxdepth + 1;
+ }
+ else
+ {
+ stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch),
+ select_in(depth, out, in, scratch, in_scratch), temp);
+ depth++;
+ }
+ }
+ }
};
template <typename T>
struct dft_plan_real : dft_plan<T>
{
size_t size;
- dft_plan_real(size_t size);
+ dft_pack_format fmt;
+ dft_stage_ptr<T> fmt_stage;
+
+ explicit dft_plan_real(size_t size, dft_pack_format fmt = dft_pack_format::CCs)
+ : dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt)
+ {
+ dft_real_initialize(*this);
+ }
void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete;
@@ -163,45 +296,50 @@ struct dft_plan_real : dft_plan<T>
void execute(univector<complex<T>, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&,
cbool_t<inverse>) const = delete;
- KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp,
- dft_pack_format fmt = dft_pack_format::CCs) const
+ KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp) const
{
this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp);
- to_fmt(out, fmt);
+ fmt_stage->execute(cfalse, out, out, nullptr);
}
- KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp,
- dft_pack_format fmt = dft_pack_format::CCs) const
+ KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp) const
{
complex<T>* outdata = ptr_cast<complex<T>>(out);
- from_fmt(outdata, in, fmt);
+ fmt_stage->execute(cfalse, outdata, in, nullptr);
this->execute_dft(ctrue, outdata, outdata, temp);
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in,
- univector<u8, Tag3>& temp,
- dft_pack_format fmt = dft_pack_format::CCs) const
+ univector<u8, Tag3>& temp) const
{
this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data());
- to_fmt(out.data(), fmt);
+ fmt_stage->execute(cfalse, out.data(), out.data(), nullptr);
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in,
- univector<u8, Tag3>& temp,
- dft_pack_format fmt = dft_pack_format::CCs) const
+ univector<u8, Tag3>& temp) const
{
complex<T>* outdata = ptr_cast<complex<T>>(out.data());
- from_fmt(outdata, in.data(), fmt);
+ fmt_stage->execute(ctrue, outdata, in.data(), nullptr);
this->execute_dft(ctrue, outdata, outdata, temp.data());
}
-private:
- univector<complex<T>> rtwiddle;
+ // Deprecated. fmt must be passed to constructor instead
+ void execute(complex<T>*, const T*, u8*, dft_pack_format) const = delete;
+ void execute(T*, const complex<T>*, u8*, dft_pack_format) const = delete;
- void to_fmt(complex<T>* out, dft_pack_format fmt) const;
- void from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const;
+ // Deprecated. fmt must be passed to constructor instead
+ template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
+ void execute(univector<complex<T>, Tag1>&, const univector<T, Tag2>&, univector<u8, Tag3>&,
+ dft_pack_format) const = delete;
+ template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
+ void execute(univector<T, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&,
+ dft_pack_format) const = delete;
};
+inline namespace CMT_ARCH_NAME
+{
+
template <typename T, univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
void fft_multiply(univector<complex<T>, Tag1>& dest, const univector<complex<T>, Tag2>& src1,
const univector<complex<T>, Tag3>& src2, dft_pack_format fmt = dft_pack_format::CCs)
diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp
@@ -82,16 +82,16 @@ univector<T> autocorrelate(const univector_ref<const T>& src1)
template <typename T>
convolve_filter<T>::convolve_filter(size_t size, size_t block_size)
- : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size),
- segments((size + block_size - 1) / block_size)
+ : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm),
+ temp(fft.temp_size), segments((size + block_size - 1) / block_size)
{
}
template <typename T>
convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size)
- : size(data.size()), block_size(next_poweroftwo(block_size)), fft(2 * next_poweroftwo(block_size)),
- temp(fft.temp_size),
+ : size(data.size()), block_size(next_poweroftwo(block_size)),
+ fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm), temp(fft.temp_size),
segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
input_position(0), position(0)
@@ -110,7 +110,7 @@ void convolve_filter<T>::set_data(const univector<T>& data)
ir_segments[i].resize(block_size, 0);
input = padded(data.slice(i * block_size, block_size));
- fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm);
+ fft.execute(ir_segments[i], input, temp);
process(ir_segments[i], ir_segments[i] * ifftsize);
}
saved_input.resize(block_size, 0);
@@ -130,7 +130,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T));
process(scratch, padded(saved_input));
- fft.execute(segments[position], scratch, temp, dft_pack_format::Perm);
+ fft.execute(segments[position], scratch, temp);
if (input_position == 0)
{
@@ -143,7 +143,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
}
fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], dft_pack_format::Perm);
- fft.execute(scratch, cscratch, temp, dft_pack_format::Perm);
+ fft.execute(scratch, cscratch, temp);
process(make_univector(output + processed, processing),
scratch.slice(input_position) + overlap.slice(input_position));
diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp
@@ -38,60 +38,6 @@
namespace kfr
{
-inline namespace CMT_ARCH_NAME
-{
-
-#define DFT_ASSERT TESTO_ASSERT_INACTIVE
-
-template <typename T>
-constexpr size_t fft_vector_width = vector_width<T>;
-
-using cdirect_t = cfalse_t;
-using cinvert_t = ctrue_t;
-
-template <typename T>
-struct dft_stage
-{
- size_t radix = 0;
- size_t stage_size = 0;
- size_t data_size = 0;
- size_t temp_size = 0;
- u8* data = nullptr;
- size_t repeats = 1;
- size_t out_offset = 0;
- size_t blocks = 0;
- const char* name = nullptr;
- bool recursion = false;
- bool can_inplace = true;
- bool inplace = false;
- bool to_scratch = false;
- bool need_reorder = true;
-
- void initialize(size_t size) { do_initialize(size); }
-
- virtual void dump() const
- {
- printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
- stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
- to_scratch);
- }
-
- KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
- {
- do_execute(cdirect_t(), out, in, temp);
- }
- KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp)
- {
- do_execute(cinvert_t(), out, in, temp);
- }
- virtual ~dft_stage() {}
-
-protected:
- virtual void do_initialize(size_t) {}
- virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0;
- virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0;
-};
-
#define DFT_STAGE_FN \
void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \
{ \
@@ -102,20 +48,27 @@ protected:
return do_execute<true>(out, in, temp); \
}
+inline namespace CMT_ARCH_NAME
+{
+
+#define DFT_ASSERT TESTO_ASSERT_INACTIVE
+
+template <typename T>
+constexpr size_t fft_vector_width = vector_width<T>;
+
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wassume")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
#endif
-template <typename T>
-template <typename Stage, typename... Args>
-void dft_plan<T>::add_stage(Args... args)
+template <typename Stage, typename T, typename... Args>
+void add_stage(dft_plan<T>* self, Args... args)
{
dft_stage<T>* stage = new Stage(args...);
- stage->need_reorder = need_reorder;
- this->data_size += stage->data_size;
- this->temp_size += stage->temp_size;
- stages.push_back(dft_stage_ptr(stage));
+ stage->need_reorder = true;
+ self->data_size += stage->data_size;
+ self->temp_size += stage->temp_size;
+ self->stages.push_back(dft_stage_ptr<T>(stage));
}
} // namespace CMT_ARCH_NAME
diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp
@@ -166,7 +166,7 @@ struct fft_inverse : internal::expression_with_arguments<E>
}
friend KFR_INTRINSIC vec<value_type, 1> get_elements(const fft_inverse& self, cinput_t input,
- size_t index, vec_shape<value_type, 1>)
+ size_t index, vec_shape<value_type, 1>)
{
return self.argument_first(input, index == 0 ? 0 : self.size() - index, vec_shape<value_type, 1>());
}
@@ -459,32 +459,31 @@ protected:
};
} // namespace intrinsics
-template <typename T>
-template <bool is_final>
-void dft_plan<T>::prepare_dft_stage(size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>)
+template <bool is_final, typename T>
+void prepare_dft_stage(dft_plan<T>* self, size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>)
{
return cswitch(
dft_radices, radix,
- [this, iterations, blocks](auto radix) CMT_INLINE_LAMBDA {
+ [self, iterations, blocks](auto radix) CMT_INLINE_LAMBDA {
add_stage<conditional<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>,
- intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations,
- blocks);
+ intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(self, radix,
+ iterations, blocks);
},
- [this, radix, iterations, blocks]() {
- add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks);
+ [self, radix, iterations, blocks]() {
+ add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(self, radix, iterations, blocks);
});
}
template <typename T>
-void dft_plan<T>::init_dft(size_t size, dft_order)
+void init_dft(dft_plan<T>* self, size_t size, dft_order)
{
if (size == 60)
{
- this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>();
+ add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(self);
}
else if (size == 48)
{
- this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>();
+ add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(self);
}
else
{
@@ -504,7 +503,7 @@ void dft_plan<T>::init_dft(size_t size, dft_order)
if (cur_size >= 101)
{
- this->add_stage<intrinsics::dft_arblen_stage_impl<T>>(size);
+ add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size);
}
else
{
@@ -518,9 +517,9 @@ void dft_plan<T>::init_dft(size_t size, dft_order)
iterations /= r;
radices[radices_size++] = r;
if (iterations == 1)
- this->prepare_dft_stage(r, iterations, blocks, ctrue);
+ prepare_dft_stage(self, r, iterations, blocks, ctrue);
else
- this->prepare_dft_stage(r, iterations, blocks, cfalse);
+ prepare_dft_stage(self, r, iterations, blocks, cfalse);
blocks *= r;
}
}
@@ -530,13 +529,13 @@ void dft_plan<T>::init_dft(size_t size, dft_order)
iterations /= cur_size;
radices[radices_size++] = cur_size;
if (iterations == 1)
- this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
+ prepare_dft_stage(self, cur_size, iterations, blocks, ctrue);
else
- this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+ prepare_dft_stage(self, cur_size, iterations, blocks, cfalse);
}
- if (stages.size() > 2)
- this->add_stage<intrinsics::dft_reorder_stage_impl<T>>(radices, radices_size);
+ if (self->stages.size() > 2)
+ add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size);
}
}
}
diff --git a/include/kfr/dft/impl/dft-src.cpp b/include/kfr/dft/impl/dft-src.cpp
@@ -80,42 +80,40 @@ extern "C"
// Real DFT plans
- KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size)
+ KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size, KFR_DFT_PACK_FORMAT pack_format)
{
- return reinterpret_cast<KFR_DFT_REAL_PLAN_F32*>(new kfr::dft_plan_real<float>(size));
+ return reinterpret_cast<KFR_DFT_REAL_PLAN_F32*>(
+ new kfr::dft_plan_real<float>(size, static_cast<dft_pack_format>(pack_format)));
}
- KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size)
+ KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size, KFR_DFT_PACK_FORMAT pack_format)
{
- return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(new kfr::dft_plan_real<double>(size));
+ return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(
+ new kfr::dft_plan_real<double>(size, static_cast<dft_pack_format>(pack_format)));
}
void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in,
- uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
+ uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute(
- reinterpret_cast<kfr::complex<float>*>(out), in, temp,
- static_cast<kfr::dft_pack_format>(pack_format));
+ reinterpret_cast<kfr::complex<float>*>(out), in, temp);
}
void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in,
- uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
+ uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute(
- reinterpret_cast<kfr::complex<double>*>(out), in, temp,
- static_cast<kfr::dft_pack_format>(pack_format));
+ reinterpret_cast<kfr::complex<double>*>(out), in, temp);
}
void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in,
- uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
+ uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute(
- out, reinterpret_cast<const kfr::complex<float>*>(in), temp,
- static_cast<kfr::dft_pack_format>(pack_format));
+ out, reinterpret_cast<const kfr::complex<float>*>(in), temp);
}
void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in,
uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
{
reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute(
- out, reinterpret_cast<const kfr::complex<double>*>(in), temp,
- static_cast<kfr::dft_pack_format>(pack_format));
+ out, reinterpret_cast<const kfr::complex<double>*>(in), temp);
}
void kfr_dft_delete_real_plan_f32(KFR_DFT_REAL_PLAN_F32* plan)
diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp
@@ -33,7 +33,7 @@ inline namespace CMT_ARCH_NAME
{
#ifndef KFR_DFT_NO_NPo2
-template void dft_plan<FLOAT>::init_dft(size_t, dft_order);
+template void init_dft(dft_plan<FLOAT>*, size_t, dft_order);
#endif
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -559,20 +559,18 @@ struct fft_reorder_stage_impl : dft_stage<T>
{
this->name = type_name<decltype(*this)>();
this->stage_size = stage_size;
- log2n = ilog2(stage_size);
+ this->user = ilog2(stage_size);
this->data_size = 0;
}
protected:
- size_t log2n;
-
virtual void do_initialize(size_t) override final {}
DFT_STAGE_FN
template <bool inverse>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
{
- fft_reorder(out, log2n, cbool_t<!is_even>());
+ fft_reorder(out, this->user, cbool_t<!is_even>());
}
};
@@ -841,21 +839,20 @@ struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
} // namespace intrinsics
-template <typename T>
-template <bool is_even, bool first>
-void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>)
+template <bool is_even, bool first, typename T>
+KFR_INTRINSIC void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>, cbool_t<first>)
{
constexpr size_t final_size = is_even ? 1024 : 512;
if (stage_size >= 2048)
{
- add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(stage_size);
+ add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size);
- make_fft(stage_size / 4, cbool_t<is_even>(), cfalse);
+ make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse);
}
else
{
- add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(final_size);
+ add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size);
}
}
@@ -866,38 +863,49 @@ struct reverse_wrapper
};
template <typename T>
-auto begin(reverse_wrapper<T> w)
+KFR_INTRINSIC auto begin(reverse_wrapper<T> w)
{
return std::rbegin(w.iterable);
}
template <typename T>
-auto end(reverse_wrapper<T> w)
+KFR_INTRINSIC auto end(reverse_wrapper<T> w)
{
return std::rend(w.iterable);
}
template <typename T>
-reverse_wrapper<T> reversed(T&& iterable)
+KFR_INTRINSIC reverse_wrapper<T> reversed(T&& iterable)
{
return { iterable };
}
template <typename T>
-void dft_plan<T>::initialize()
+KFR_INTRINSIC void initialize_data_stage(dft_plan<T>* self, const dft_stage_ptr<T>& stage, size_t& offset)
+{
+ stage->data = self->data.data() + offset;
+ stage->initialize(self->size);
+ offset += stage->data_size;
+}
+
+template <typename T>
+KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self)
{
- data = autofree<u8>(data_size);
+ self->data = autofree<u8>(self->data_size);
size_t offset = 0;
- for (dft_stage_ptr& stage : stages)
+ for (dft_stage_ptr<T>& stage : self->stages)
{
- stage->data = data.data() + offset;
- stage->initialize(this->size);
- offset += stage->data_size;
+ initialize_data_stage(self, stage, offset);
}
+ return offset;
+}
+template <typename T>
+KFR_INTRINSIC void initialize_order(dft_plan<T>* self)
+{
bool to_scratch = false;
bool scratch_needed = false;
- for (dft_stage_ptr& stage : reversed(stages))
+ for (dft_stage_ptr<T>& stage : reversed(self->stages))
{
if (to_scratch)
{
@@ -909,141 +917,47 @@ void dft_plan<T>::initialize()
to_scratch = !to_scratch;
}
}
- if (scratch_needed || !stages[0]->can_inplace)
- this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
-}
-
-template <typename T>
-const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in,
- const complex<T>* scratch, bool in_scratch) const
-{
- if (stage == 0)
- return in_scratch ? scratch : in;
- return stages[stage - 1]->to_scratch ? scratch : out;
-}
-
-template <typename T>
-complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
-{
- return stages[stage]->to_scratch ? scratch : out;
-}
-
-template <typename T>
-template <bool inverse>
-void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
-{
- if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
- {
- return stages[0]->execute(cbool<inverse>, out, in, temp);
- }
- size_t stack[32] = { 0 };
-
- complex<T>* scratch =
- ptr_cast<complex<T>>(temp + this->temp_size -
- align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
-
- bool in_scratch = !stages[0]->can_inplace && in == out;
- if (in_scratch)
- {
- builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
- }
-
- const size_t count = stages.size();
-
- for (size_t depth = 0; depth < count;)
- {
- if (stages[depth]->recursion)
- {
- size_t offset = 0;
- size_t rdepth = depth;
- size_t maxdepth = depth;
- do
- {
- if (stack[rdepth] == stages[rdepth]->repeats)
- {
- stack[rdepth] = 0;
- rdepth--;
- }
- else
- {
- complex<T>* rout = select_out(rdepth, out, scratch);
- const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
- stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
- offset += stages[rdepth]->out_offset;
- stack[rdepth]++;
- if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
- rdepth++;
- else
- maxdepth = rdepth;
- }
- } while (rdepth != depth);
- depth = maxdepth + 1;
- }
- else
- {
- stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch),
- select_in(depth, out, in, scratch, in_scratch), temp);
- depth++;
- }
- }
+ if (scratch_needed || !self->stages[0]->can_inplace)
+ self->temp_size += align_up(sizeof(complex<T>) * self->size, platform<>::native_cache_alignment);
}
template <typename T>
-void dft_plan<T>::init_fft(size_t size, dft_order)
+KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order)
{
const size_t log2n = ilog2(size);
cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n,
[&](auto log2n) {
(void)log2n;
constexpr size_t log2nv = val_of(decltype(log2n)());
- this->add_stage<intrinsics::fft_specialization<T, log2nv>>(size);
+ add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size);
},
[&]() {
cswitch(cfalse_true, is_even(log2n), [&](auto is_even) {
- this->make_fft(size, is_even, ctrue);
+ make_fft(self, size, is_even, ctrue);
constexpr size_t is_evenv = val_of(decltype(is_even)());
- if (need_reorder)
- this->add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(size);
+ add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(self, size);
});
});
}
template <typename T>
-dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0)
-{
- need_reorder = true;
- if (is_poweroftwo(size))
- {
- init_fft(size, order);
- }
-#ifndef KFR_DFT_NO_NPo2
- else
- {
- init_dft(size, order);
- }
-#endif
- initialize();
-}
-
-template <typename T>
-dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4)
+KFR_INTRINSIC void generate_real_twiddles(dft_plan_real<T>* self, size_t size)
{
using namespace intrinsics;
-
constexpr size_t width = vector_width<T> * 2;
-
block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) {
constexpr size_t width = val_of(decltype(w)());
- cwrite<width>(rtwiddle.data() + i,
+ cwrite<width>(self->rtwiddle.data() + i,
cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2)))));
});
}
template <typename T>
-void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const
+KFR_INTRINSIC void to_fmt(size_t real_size, const complex<T>* rtwiddle, complex<T>* out, const complex<T>* in,
+ dft_pack_format fmt)
{
using namespace intrinsics;
- size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2
+ size_t csize = real_size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2
constexpr size_t width = vector_width<T> * 2;
const cvec<T, 1> dc = cread<1>(out);
@@ -1053,9 +967,9 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const
i++;
constexpr size_t width = val_of(decltype(w)());
constexpr size_t widthm1 = width - 1;
- const cvec<T, width> tw = cread<width>(rtwiddle.data() + i);
- const cvec<T, width> fpk = cread<width>(out + i);
- const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1)));
+ const cvec<T, width> tw = cread<width>(rtwiddle + i);
+ const cvec<T, width> fpk = cread<width>(in + i);
+ const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
const cvec<T, width> f1k = fpk + fpnk;
const cvec<T, width> f2k = fpk - fpnk;
@@ -1066,7 +980,7 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const
{
size_t k = csize / 2;
- const cvec<T, 1> fpk = cread<1>(out + k);
+ const cvec<T, 1> fpk = cread<1>(in + k);
const cvec<T, 1> fpnk = negodd(fpk);
cwrite<1>(out + k, fpnk);
}
@@ -1082,11 +996,12 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const
}
template <typename T>
-void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const
+KFR_INTRINSIC void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const complex<T>* in,
+ dft_pack_format fmt)
{
using namespace intrinsics;
- const size_t csize = this->size / 2;
+ const size_t csize = real_size / 2;
cvec<T, 1> dc;
@@ -1106,7 +1021,7 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_
i++;
constexpr size_t width = val_of(decltype(w)());
constexpr size_t widthm1 = width - 1;
- const cvec<T, width> tw = cread<width>(rtwiddle.data() + i);
+ const cvec<T, width> tw = cread<width>(rtwiddle + i);
const cvec<T, width> fpk = cread<width>(in + i);
const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
@@ -1127,18 +1042,77 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_
}
template <typename T>
-dft_plan<T>::~dft_plan()
+void init_dft(dft_plan<T>* self, size_t size, dft_order);
+
+template <typename T>
+KFR_INTRINSIC void initialize_stages(dft_plan<T>* self)
+{
+ if (is_poweroftwo(self->size))
+ {
+ init_fft(self, self->size, dft_order::normal);
+ }
+#ifndef KFR_DFT_NO_NPo2
+ else
+ {
+ init_dft(self, self->size, dft_order::normal);
+ }
+#endif
+}
+
+template <typename T>
+void dft_initialize(dft_plan<T>& plan)
{
+ initialize_stages(&plan);
+ initialize_data(&plan);
+ initialize_order(&plan);
}
template <typename T>
-void dft_plan<T>::dump() const
+struct dft_stage_real_repack : dft_stage<T>
{
- for (const dft_stage_ptr& s : stages)
+public:
+ dft_stage_real_repack(size_t real_size, dft_pack_format fmt)
+ {
+ this->user = static_cast<int>(fmt);
+ this->stage_size = real_size;
+ this->data_size = align_up(sizeof(complex<T>) * (real_size / 4), platform<>::native_cache_alignment);
+ }
+ void do_initialize(size_t) override
{
- s->dump();
+ using namespace intrinsics;
+ constexpr size_t width = vector_width<T> * 2;
+ const size_t real_size = this->stage_size;
+ complex<T>* rtwiddle = ptr_cast<complex<T>>(this->data);
+ block_process(real_size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) {
+ constexpr size_t width = val_of(decltype(w)());
+ cwrite<width>(rtwiddle + i,
+ cossin(dup(-constants<T>::pi *
+ ((enumerate<T, width>() + i + real_size / 4) / (real_size / 2)))));
+ });
}
+ void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override
+ {
+ to_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in,
+ static_cast<dft_pack_format>(this->user));
+ }
+ void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override
+ {
+ from_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in,
+ static_cast<dft_pack_format>(this->user));
+ }
+};
+
+template <typename T>
+void dft_real_initialize(dft_plan_real<T>& plan)
+{
+ initialize_stages(&plan);
+ plan.fmt_stage.reset(new dft_stage_real_repack<T>(plan.size, plan.fmt));
+ plan.data_size += plan.fmt_stage->data_size;
+ size_t offset = initialize_data(&plan);
+ initialize_data_stage(&plan, plan.fmt_stage, offset);
+ initialize_order(&plan);
}
+
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp
@@ -31,19 +31,8 @@ namespace kfr
{
inline namespace CMT_ARCH_NAME
{
-
-template dft_plan<FLOAT>::dft_plan(size_t, dft_order);
-template void dft_plan<FLOAT>::init_fft(size_t, dft_order);
-template dft_plan<FLOAT>::~dft_plan();
-template void dft_plan<FLOAT>::dump() const;
-template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out,
- const kfr::complex<FLOAT>* in, kfr::u8* temp) const;
-template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out,
- const kfr::complex<FLOAT>* in, kfr::u8* temp) const;
-template dft_plan_real<FLOAT>::dft_plan_real(size_t);
-template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in,
- kfr::dft_pack_format fmt) const;
-template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const;
+template void dft_initialize<FLOAT>(dft_plan<FLOAT>& plan);
+template void dft_real_initialize<FLOAT>(dft_plan_real<FLOAT>& plan);
} // namespace CMT_ARCH_NAME
} // namespace kfr