kfr

Fast, modern C++ DSP framework: FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 03ef5fea15070ffca1574e7399086bc11d650002
parent 976e9ecbc82b42d5ba6b51cbbce42df394e60a11
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Tue, 16 Apr 2019 15:00:37 +0000

DFT refactoring

Diffstat:
M CMakeLists.txt                             |   7 +++++++
A dft/CMakeLists.txt                         |  32 ++++++++++++++++++++++++++++++++
M include/kfr/cident.h                       |  68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/dft/dft_c.h                    |  12 ++++++------
M include/kfr/dft/fft.hpp                    | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M include/kfr/dft/impl/convolution-impl.cpp  |  14 +++++++-------
M include/kfr/dft/impl/dft-fft.hpp           |  75 ++++++++++++++-------------------------------------------------------------
M include/kfr/dft/impl/dft-impl.hpp          |  37 ++++++++++++++++++-------------------
M include/kfr/dft/impl/dft-src.cpp           |  28 +++++++++++++---------------
M include/kfr/dft/impl/dft-templates.hpp     |   2 +-
M include/kfr/dft/impl/fft-impl.hpp          | 246 +++++++++++++++++++++++++++++++++++--------------------------------------------
M include/kfr/dft/impl/fft-templates.hpp     |  15 ++-------------
12 files changed, 465 insertions(+), 307 deletions(-)

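The headline interface change in this commit is that the pack format of a real DFT is now a property of the plan rather than of each call: dft_plan_real gains a dft_pack_format constructor argument (defaulting to CCs), the execute() overloads that took a dft_pack_format are deleted, and a dedicated dft_stage_real_repack stage performs the packing. A minimal usage sketch of the updated C++ API follows; the constructor and execute() signatures are taken from the fft.hpp diff below, while the umbrella header paths and the example sizes are assumptions.

#include <kfr/base.hpp> // univector, complex, u8 (assumed umbrella header)
#include <kfr/dft.hpp>  // dft_plan_real, dft_pack_format (assumed umbrella header)

using namespace kfr;

int main()
{
    const size_t n = 1024;
    univector<float> in(n);                   // real input signal
    univector<complex<float>> out(n / 2 + 1); // CCs packing: N/2+1 complex bins

    // New in this commit: the pack format is fixed when the plan is constructed...
    dft_plan_real<float> plan(n, dft_pack_format::CCs);
    univector<u8> temp(plan.temp_size);

    // ...so execute() no longer takes a dft_pack_format argument.
    plan.execute(out, in, temp);
    return 0;
}
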
diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -58,6 +58,9 @@ include(sources.cmake) option(ENABLE_TESTS "Enable tests and examples" OFF) if (CLANG) option(ENABLE_DFT "Enable DFT and related algorithms. Requires Clang" ON) + if (X86) + option(ENABLE_DFT_MULTIARCH "Build DFT static libraries for various architectures. Requires Clang" OFF) + endif () endif () option(ENABLE_ASMTEST "Enable writing disassembly" OFF) option(REGENERATE_TESTS "Regenerate auto tests" OFF) @@ -176,6 +179,10 @@ if (ENABLE_DFT) else() target_compile_options(kfr_dft PRIVATE -ffast-math) endif() + + if (ENABLE_DFT_MULTIARCH) + add_subdirectory(dft) + endif () endif() add_library(kfr_io ${KFR_IO_SRC}) diff --git a/dft/CMakeLists.txt b/dft/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (C) 2016 D Levin (http://www.kfrlib.com) +# This file is part of KFR +# +# KFR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# KFR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with KFR. + + +cmake_minimum_required(VERSION 3.1) + +function(add_dft_library ARCH) + add_library(kfr_dft_${ARCH} STATIC ${KFR_DFT_SRC}) + target_link_libraries(kfr_dft_${ARCH} kfr) + target_set_arch(kfr_dft_${ARCH} INTERFACE ${ARCH}) + target_compile_options(kfr_dft_${ARCH} PRIVATE -Xclang -ffast-math) +endfunction() + +add_dft_library(sse2) +add_dft_library(sse41) +add_dft_library(avx) +add_dft_library(avx2) +add_dft_library(avx512) +\ No newline at end of file diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -160,6 +160,74 @@ extern char* gets(char* __s); #define CMT_ARCH_NAME generic #endif +#define CMT_ARCH_ID_GENERIC 0 + +#define CMT_ARCH_ID_SSE2 1 +#define CMT_ARCH_ID_SSE3 2 +#define CMT_ARCH_ID_SSSE3 3 +#define CMT_ARCH_ID_SSE41 4 +#define CMT_ARCH_ID_SSE42 5 +#define CMT_ARCH_ID_AVX 6 +#define CMT_ARCH_ID_AVX2 7 +#define CMT_ARCH_ID_AVX512 8 + +#define CMT_ARCH_ID_NEON 1 +#define CMT_ARCH_ID_NEON64 2 + +#ifdef CMT_ENABLE_SSE2 +#define CMT_EXPAND_IF_ARCH_sse2(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse2(...) +#endif + +#ifdef CMT_ENABLE_SSE3 +#define CMT_EXPAND_IF_ARCH_sse3(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse3(...) +#endif + +#ifdef CMT_ENABLE_SSSE3 +#define CMT_EXPAND_IF_ARCH_ssse3(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_ssse3(...) +#endif + +#ifdef CMT_ENABLE_SSE41 +#define CMT_EXPAND_IF_ARCH_sse41(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse41(...) +#endif + +#ifdef CMT_ENABLE_SSE41 +#define CMT_EXPAND_IF_ARCH_sse41(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse41(...) +#endif + +#ifdef CMT_ENABLE_SSE42 +#define CMT_EXPAND_IF_ARCH_sse42(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_sse42(...) +#endif + +#ifdef CMT_ENABLE_AVX +#define CMT_EXPAND_IF_ARCH_avx(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_avx(...) +#endif + +#ifdef CMT_ENABLE_AVX2 +#define CMT_EXPAND_IF_ARCH_avx2(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_avx2(...) +#endif + +#ifdef CMT_ENABLE_AVX512 +#define CMT_EXPAND_IF_ARCH_avx512(...) __VA_ARGS__ +#else +#define CMT_EXPAND_IF_ARCH_avx512(...) 
+#endif + #ifndef CMT_NO_NATIVE_F64 #define CMT_NATIVE_F64 1 #endif diff --git a/include/kfr/dft/dft_c.h b/include/kfr/dft/dft_c.h @@ -82,18 +82,18 @@ extern "C" // Real DFT plans - KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size); - KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size); + KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size, KFR_DFT_PACK_FORMAT pack_format); + KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size, KFR_DFT_PACK_FORMAT pack_format); void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, const float* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + uint8_t* temp); void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, const double* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + uint8_t* temp); void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, - const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + const float* in, uint8_t* temp); void kfr_dft_execute_real_inverse_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, - const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format); + const double* in, uint8_t* temp); void kfr_dft_delete_real_plan_f32(KFR_DFT_REAL_PLAN_F32* plan); void kfr_dft_delete_real_plan_f64(KFR_DFT_REAL_PLAN_F64* plan); diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp @@ -47,6 +47,53 @@ CMT_PRAGMA_MSVC(warning(disable : 4100)) namespace kfr { +using cdirect_t = cfalse_t; +using cinvert_t = ctrue_t; + +template <typename T> +struct dft_stage +{ + size_t radix = 0; + size_t stage_size = 0; + size_t data_size = 0; + size_t temp_size = 0; + u8* data = nullptr; + size_t repeats = 1; + size_t out_offset = 0; + size_t blocks = 0; + size_t user = 0; + const char* name = nullptr; + bool recursion = false; + bool can_inplace = true; + bool inplace = false; + bool to_scratch = false; + bool need_reorder = true; + + void initialize(size_t size) { do_initialize(size); } + + virtual void dump() const + { + printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix, + stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, + to_scratch); + } + + KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cdirect_t(), out, in, temp); + } + KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) + { + do_execute(cinvert_t(), out, in, temp); + } + virtual ~dft_stage() {} + +protected: + virtual void do_initialize(size_t) {} + virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; + virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; +}; + enum class dft_type { both, @@ -60,24 +107,52 @@ enum class dft_order internal, // possibly bit/digit-reversed, implementation-defined, faster to compute }; -inline namespace CMT_ARCH_NAME +enum class dft_pack_format { + Perm, // {X[0].r, X[N].r}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i} + CCs // {X[0].r, 0}, ... {X[i].r, X[i].i}, ... 
{X[N-1].r, X[N-1].i}, {X[N].r, 0} +}; + +template <typename T> +struct dft_plan; + +template <typename T> +struct dft_plan_real; template <typename T> struct dft_stage; +template <typename T> +using dft_stage_ptr = std::unique_ptr<dft_stage<T>>; + +inline namespace CMT_ARCH_NAME +{ +template <typename T> +void dft_initialize(dft_plan<T>& plan); +template <typename T> +void dft_real_initialize(dft_plan_real<T>& plan); +} // namespace CMT_ARCH_NAME + /// @brief Class for performing DFT/FFT template <typename T> struct dft_plan { - using dft_stage_ptr = std::unique_ptr<dft_stage<T>>; - size_t size; size_t temp_size; - dft_plan(size_t size, dft_order order = dft_order::normal); + explicit dft_plan(size_t size, dft_order order = dft_order::normal) + : size(size), temp_size(0), data_size(0) + { + dft_initialize(*this); + } - void dump() const; + void dump() const + { + for (const std::unique_ptr<dft_stage<T>>& s : stages) + { + s->dump(); + } + } KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const @@ -87,7 +162,7 @@ struct dft_plan else execute_dft(cfalse, out, in, temp); } - ~dft_plan(); + ~dft_plan() {} template <bool inverse> KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp, cbool_t<inverse> inv) const @@ -111,44 +186,102 @@ struct dft_plan execute_dft(inv, out.data(), in.data(), temp.data()); } -protected: autofree<u8> data; size_t data_size; - std::vector<dft_stage_ptr> stages; - bool need_reorder; - - template <typename Stage, typename... Args> - void add_stage(Args... args); - - template <bool is_final> - void prepare_dft_stage(size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>); - - template <bool is_even, bool first> - void make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>); - - void initialize(); - template <bool inverse> - void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const; + std::vector<dft_stage_ptr<T>> stages; +protected: + struct noinit + { + }; + explicit dft_plan(noinit, size_t size, dft_order order = dft_order::normal) + : size(size), temp_size(0), data_size(0) + { + } const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch, bool in_scratch) const; - complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const; - - void init_dft(size_t size, dft_order order); - void init_fft(size_t size, dft_order order); -}; + const complex<T>* scratch, bool in_scratch) const + { + if (stage == 0) + return in_scratch ? scratch : in; + return stages[stage - 1]->to_scratch ? scratch : out; + } + complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const + { + return stages[stage]->to_scratch ? scratch : out; + } -enum class dft_pack_format -{ - Perm, // {X[0].r, X[N].r}, ... {X[i].r, X[i].i}, ... {X[N-1].r, X[N-1].i} - CCs // {X[0].r, 0}, ... {X[i].r, X[i].i}, ... 
{X[N-1].r, X[N-1].i}, {X[N].r, 0} + template <bool inverse> + void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const + { + if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) + { + return stages[0]->execute(cbool<inverse>, out, in, temp); + } + size_t stack[32] = { 0 }; + + complex<T>* scratch = ptr_cast<complex<T>>( + temp + this->temp_size - + align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); + + bool in_scratch = !stages[0]->can_inplace && in == out; + if (in_scratch) + { + builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); + } + + const size_t count = stages.size(); + + for (size_t depth = 0; depth < count;) + { + if (stages[depth]->recursion) + { + size_t offset = 0; + size_t rdepth = depth; + size_t maxdepth = depth; + do + { + if (stack[rdepth] == stages[rdepth]->repeats) + { + stack[rdepth] = 0; + rdepth--; + } + else + { + complex<T>* rout = select_out(rdepth, out, scratch); + const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); + stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); + offset += stages[rdepth]->out_offset; + stack[rdepth]++; + if (rdepth < count - 1 && stages[rdepth + 1]->recursion) + rdepth++; + else + maxdepth = rdepth; + } + } while (rdepth != depth); + depth = maxdepth + 1; + } + else + { + stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), + select_in(depth, out, in, scratch, in_scratch), temp); + depth++; + } + } + } }; template <typename T> struct dft_plan_real : dft_plan<T> { size_t size; - dft_plan_real(size_t size); + dft_pack_format fmt; + dft_stage_ptr<T> fmt_stage; + + explicit dft_plan_real(size_t size, dft_pack_format fmt = dft_pack_format::CCs) + : dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt) + { + dft_real_initialize(*this); + } void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete; @@ -163,45 +296,50 @@ struct dft_plan_real : dft_plan<T> void execute(univector<complex<T>, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&, cbool_t<inverse>) const = delete; - KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp) const { this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp); - to_fmt(out, fmt); + fmt_stage->execute(cfalse, out, out, nullptr); } - KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, - dft_pack_format fmt = dft_pack_format::CCs) const + KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp) const { complex<T>* outdata = ptr_cast<complex<T>>(out); - from_fmt(outdata, in, fmt); + fmt_stage->execute(cfalse, outdata, in, nullptr); this->execute_dft(ctrue, outdata, outdata, temp); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in, - univector<u8, Tag3>& temp, - dft_pack_format fmt = dft_pack_format::CCs) const + univector<u8, Tag3>& temp) const { this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data()); - to_fmt(out.data(), fmt); + fmt_stage->execute(cfalse, out.data(), out.data(), nullptr); } template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in, - univector<u8, Tag3>& temp, - 
dft_pack_format fmt = dft_pack_format::CCs) const + univector<u8, Tag3>& temp) const { complex<T>* outdata = ptr_cast<complex<T>>(out.data()); - from_fmt(outdata, in.data(), fmt); + fmt_stage->execute(ctrue, outdata, in.data(), nullptr); this->execute_dft(ctrue, outdata, outdata, temp.data()); } -private: - univector<complex<T>> rtwiddle; + // Deprecated. fmt must be passed to constructor instead + void execute(complex<T>*, const T*, u8*, dft_pack_format) const = delete; + void execute(T*, const complex<T>*, u8*, dft_pack_format) const = delete; - void to_fmt(complex<T>* out, dft_pack_format fmt) const; - void from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const; + // Deprecated. fmt must be passed to constructor instead + template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> + void execute(univector<complex<T>, Tag1>&, const univector<T, Tag2>&, univector<u8, Tag3>&, + dft_pack_format) const = delete; + template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> + void execute(univector<T, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&, + dft_pack_format) const = delete; }; +inline namespace CMT_ARCH_NAME +{ + template <typename T, univector_tag Tag1, univector_tag Tag2, univector_tag Tag3> void fft_multiply(univector<complex<T>, Tag1>& dest, const univector<complex<T>, Tag2>& src1, const univector<complex<T>, Tag3>& src2, dft_pack_format fmt = dft_pack_format::CCs) diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp @@ -82,16 +82,16 @@ univector<T> autocorrelate(const univector_ref<const T>& src1) template <typename T> convolve_filter<T>::convolve_filter(size_t size, size_t block_size) - : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size), - segments((size + block_size - 1) / block_size) + : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm), + temp(fft.temp_size), segments((size + block_size - 1) / block_size) { } template <typename T> convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size) - : size(data.size()), block_size(next_poweroftwo(block_size)), fft(2 * next_poweroftwo(block_size)), - temp(fft.temp_size), + : size(data.size()), block_size(next_poweroftwo(block_size)), + fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm), temp(fft.temp_size), segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)), input_position(0), position(0) @@ -110,7 +110,7 @@ void convolve_filter<T>::set_data(const univector<T>& data) ir_segments[i].resize(block_size, 0); input = padded(data.slice(i * block_size, block_size)); - fft.execute(ir_segments[i], input, temp, dft_pack_format::Perm); + fft.execute(ir_segments[i], input, temp); process(ir_segments[i], ir_segments[i] * ifftsize); } saved_input.resize(block_size, 0); @@ -130,7 +130,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T)); process(scratch, padded(saved_input)); - fft.execute(segments[position], scratch, temp, dft_pack_format::Perm); + fft.execute(segments[position], scratch, temp); if (input_position == 0) { @@ -143,7 +143,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) } fft_multiply_accumulate(cscratch, premul, 
ir_segments[0], segments[position], dft_pack_format::Perm); - fft.execute(scratch, cscratch, temp, dft_pack_format::Perm); + fft.execute(scratch, cscratch, temp); process(make_univector(output + processed, processing), scratch.slice(input_position) + overlap.slice(input_position)); diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp @@ -38,60 +38,6 @@ namespace kfr { -inline namespace CMT_ARCH_NAME -{ - -#define DFT_ASSERT TESTO_ASSERT_INACTIVE - -template <typename T> -constexpr size_t fft_vector_width = vector_width<T>; - -using cdirect_t = cfalse_t; -using cinvert_t = ctrue_t; - -template <typename T> -struct dft_stage -{ - size_t radix = 0; - size_t stage_size = 0; - size_t data_size = 0; - size_t temp_size = 0; - u8* data = nullptr; - size_t repeats = 1; - size_t out_offset = 0; - size_t blocks = 0; - const char* name = nullptr; - bool recursion = false; - bool can_inplace = true; - bool inplace = false; - bool to_scratch = false; - bool need_reorder = true; - - void initialize(size_t size) { do_initialize(size); } - - virtual void dump() const - { - printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix, - stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace, - to_scratch); - } - - KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cdirect_t(), out, in, temp); - } - KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) - { - do_execute(cinvert_t(), out, in, temp); - } - virtual ~dft_stage() {} - -protected: - virtual void do_initialize(size_t) {} - virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0; - virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0; -}; - #define DFT_STAGE_FN \ void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ { \ @@ -102,20 +48,27 @@ protected: return do_execute<true>(out, in, temp); \ } +inline namespace CMT_ARCH_NAME +{ + +#define DFT_ASSERT TESTO_ASSERT_INACTIVE + +template <typename T> +constexpr size_t fft_vector_width = vector_width<T>; + CMT_PRAGMA_GNU(GCC diagnostic push) #if CMT_HAS_WARNING("-Wassume") CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") #endif -template <typename T> -template <typename Stage, typename... Args> -void dft_plan<T>::add_stage(Args... args) +template <typename Stage, typename T, typename... Args> +void add_stage(dft_plan<T>* self, Args... args) { dft_stage<T>* stage = new Stage(args...); - stage->need_reorder = need_reorder; - this->data_size += stage->data_size; - this->temp_size += stage->temp_size; - stages.push_back(dft_stage_ptr(stage)); + stage->need_reorder = true; + self->data_size += stage->data_size; + self->temp_size += stage->temp_size; + self->stages.push_back(dft_stage_ptr<T>(stage)); } } // namespace CMT_ARCH_NAME diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp @@ -166,7 +166,7 @@ struct fft_inverse : internal::expression_with_arguments<E> } friend KFR_INTRINSIC vec<value_type, 1> get_elements(const fft_inverse& self, cinput_t input, - size_t index, vec_shape<value_type, 1>) + size_t index, vec_shape<value_type, 1>) { return self.argument_first(input, index == 0 ? 
0 : self.size() - index, vec_shape<value_type, 1>()); } @@ -459,32 +459,31 @@ protected: }; } // namespace intrinsics -template <typename T> -template <bool is_final> -void dft_plan<T>::prepare_dft_stage(size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>) +template <bool is_final, typename T> +void prepare_dft_stage(dft_plan<T>* self, size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>) { return cswitch( dft_radices, radix, - [this, iterations, blocks](auto radix) CMT_INLINE_LAMBDA { + [self, iterations, blocks](auto radix) CMT_INLINE_LAMBDA { add_stage<conditional<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>, - intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations, - blocks); + intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(self, radix, + iterations, blocks); }, - [this, radix, iterations, blocks]() { - add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); + [self, radix, iterations, blocks]() { + add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(self, radix, iterations, blocks); }); } template <typename T> -void dft_plan<T>::init_dft(size_t size, dft_order) +void init_dft(dft_plan<T>* self, size_t size, dft_order) { if (size == 60) { - this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(); + add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(self); } else if (size == 48) { - this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(); + add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(self); } else { @@ -504,7 +503,7 @@ void dft_plan<T>::init_dft(size_t size, dft_order) if (cur_size >= 101) { - this->add_stage<intrinsics::dft_arblen_stage_impl<T>>(size); + add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size); } else { @@ -518,9 +517,9 @@ void dft_plan<T>::init_dft(size_t size, dft_order) iterations /= r; radices[radices_size++] = r; if (iterations == 1) - this->prepare_dft_stage(r, iterations, blocks, ctrue); + prepare_dft_stage(self, r, iterations, blocks, ctrue); else - this->prepare_dft_stage(r, iterations, blocks, cfalse); + prepare_dft_stage(self, r, iterations, blocks, cfalse); blocks *= r; } } @@ -530,13 +529,13 @@ void dft_plan<T>::init_dft(size_t size, dft_order) iterations /= cur_size; radices[radices_size++] = cur_size; if (iterations == 1) - this->prepare_dft_stage(cur_size, iterations, blocks, ctrue); + prepare_dft_stage(self, cur_size, iterations, blocks, ctrue); else - this->prepare_dft_stage(cur_size, iterations, blocks, cfalse); + prepare_dft_stage(self, cur_size, iterations, blocks, cfalse); } - if (stages.size() > 2) - this->add_stage<intrinsics::dft_reorder_stage_impl<T>>(radices, radices_size); + if (self->stages.size() > 2) + add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size); } } } diff --git a/include/kfr/dft/impl/dft-src.cpp b/include/kfr/dft/impl/dft-src.cpp @@ -80,42 +80,40 @@ extern "C" // Real DFT plans - KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size) + KFR_DFT_REAL_PLAN_F32* kfr_dft_create_real_plan_f32(size_t size, KFR_DFT_PACK_FORMAT pack_format) { - return reinterpret_cast<KFR_DFT_REAL_PLAN_F32*>(new kfr::dft_plan_real<float>(size)); + return reinterpret_cast<KFR_DFT_REAL_PLAN_F32*>( + new kfr::dft_plan_real<float>(size, static_cast<dft_pack_format>(pack_format))); } - KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size) + KFR_DFT_REAL_PLAN_F64* kfr_dft_create_real_plan_f64(size_t size, KFR_DFT_PACK_FORMAT pack_format) { - return 
reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(new kfr::dft_plan_real<double>(size)); + return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>( + new kfr::dft_plan_real<double>(size, static_cast<dft_pack_format>(pack_format))); } void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + uint8_t* temp) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( - reinterpret_cast<kfr::complex<float>*>(out), in, temp, - static_cast<kfr::dft_pack_format>(pack_format)); + reinterpret_cast<kfr::complex<float>*>(out), in, temp); } void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + uint8_t* temp) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( - reinterpret_cast<kfr::complex<double>*>(out), in, temp, - static_cast<kfr::dft_pack_format>(pack_format)); + reinterpret_cast<kfr::complex<double>*>(out), in, temp); } void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in, - uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) + uint8_t* temp) { reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute( - out, reinterpret_cast<const kfr::complex<float>*>(in), temp, - static_cast<kfr::dft_pack_format>(pack_format)); + out, reinterpret_cast<const kfr::complex<float>*>(in), temp); } void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format) { reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute( - out, reinterpret_cast<const kfr::complex<double>*>(in), temp, - static_cast<kfr::dft_pack_format>(pack_format)); + out, reinterpret_cast<const kfr::complex<double>*>(in), temp); } void kfr_dft_delete_real_plan_f32(KFR_DFT_REAL_PLAN_F32* plan) diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp @@ -33,7 +33,7 @@ inline namespace CMT_ARCH_NAME { #ifndef KFR_DFT_NO_NPo2 -template void dft_plan<FLOAT>::init_dft(size_t, dft_order); +template void init_dft(dft_plan<FLOAT>*, size_t, dft_order); #endif } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp @@ -559,20 +559,18 @@ struct fft_reorder_stage_impl : dft_stage<T> { this->name = type_name<decltype(*this)>(); this->stage_size = stage_size; - log2n = ilog2(stage_size); + this->user = ilog2(stage_size); this->data_size = 0; } protected: - size_t log2n; - virtual void do_initialize(size_t) override final {} DFT_STAGE_FN template <bool inverse> KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*) { - fft_reorder(out, log2n, cbool_t<!is_even>()); + fft_reorder(out, this->user, cbool_t<!is_even>()); } }; @@ -841,21 +839,20 @@ struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024> } // namespace intrinsics -template <typename T> -template <bool is_even, bool first> -void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>) +template <bool is_even, bool first, typename T> +KFR_INTRINSIC void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>, cbool_t<first>) { constexpr size_t final_size = is_even ? 
1024 : 512; if (stage_size >= 2048) { - add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(stage_size); + add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size); - make_fft(stage_size / 4, cbool_t<is_even>(), cfalse); + make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse); } else { - add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(final_size); + add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size); } } @@ -866,38 +863,49 @@ struct reverse_wrapper }; template <typename T> -auto begin(reverse_wrapper<T> w) +KFR_INTRINSIC auto begin(reverse_wrapper<T> w) { return std::rbegin(w.iterable); } template <typename T> -auto end(reverse_wrapper<T> w) +KFR_INTRINSIC auto end(reverse_wrapper<T> w) { return std::rend(w.iterable); } template <typename T> -reverse_wrapper<T> reversed(T&& iterable) +KFR_INTRINSIC reverse_wrapper<T> reversed(T&& iterable) { return { iterable }; } template <typename T> -void dft_plan<T>::initialize() +KFR_INTRINSIC void initialize_data_stage(dft_plan<T>* self, const dft_stage_ptr<T>& stage, size_t& offset) +{ + stage->data = self->data.data() + offset; + stage->initialize(self->size); + offset += stage->data_size; +} + +template <typename T> +KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self) { - data = autofree<u8>(data_size); + self->data = autofree<u8>(self->data_size); size_t offset = 0; - for (dft_stage_ptr& stage : stages) + for (dft_stage_ptr<T>& stage : self->stages) { - stage->data = data.data() + offset; - stage->initialize(this->size); - offset += stage->data_size; + initialize_data_stage(self, stage, offset); } + return offset; +} +template <typename T> +KFR_INTRINSIC void initialize_order(dft_plan<T>* self) +{ bool to_scratch = false; bool scratch_needed = false; - for (dft_stage_ptr& stage : reversed(stages)) + for (dft_stage_ptr<T>& stage : reversed(self->stages)) { if (to_scratch) { @@ -909,141 +917,47 @@ void dft_plan<T>::initialize() to_scratch = !to_scratch; } } - if (scratch_needed || !stages[0]->can_inplace) - this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment); -} - -template <typename T> -const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in, - const complex<T>* scratch, bool in_scratch) const -{ - if (stage == 0) - return in_scratch ? scratch : in; - return stages[stage - 1]->to_scratch ? scratch : out; -} - -template <typename T> -complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const -{ - return stages[stage]->to_scratch ? 
scratch : out; -} - -template <typename T> -template <bool inverse> -void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const -{ - if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) - { - return stages[0]->execute(cbool<inverse>, out, in, temp); - } - size_t stack[32] = { 0 }; - - complex<T>* scratch = - ptr_cast<complex<T>>(temp + this->temp_size - - align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment)); - - bool in_scratch = !stages[0]->can_inplace && in == out; - if (in_scratch) - { - builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size); - } - - const size_t count = stages.size(); - - for (size_t depth = 0; depth < count;) - { - if (stages[depth]->recursion) - { - size_t offset = 0; - size_t rdepth = depth; - size_t maxdepth = depth; - do - { - if (stack[rdepth] == stages[rdepth]->repeats) - { - stack[rdepth] = 0; - rdepth--; - } - else - { - complex<T>* rout = select_out(rdepth, out, scratch); - const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch); - stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); - offset += stages[rdepth]->out_offset; - stack[rdepth]++; - if (rdepth < count - 1 && stages[rdepth + 1]->recursion) - rdepth++; - else - maxdepth = rdepth; - } - } while (rdepth != depth); - depth = maxdepth + 1; - } - else - { - stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch), - select_in(depth, out, in, scratch, in_scratch), temp); - depth++; - } - } + if (scratch_needed || !self->stages[0]->can_inplace) + self->temp_size += align_up(sizeof(complex<T>) * self->size, platform<>::native_cache_alignment); } template <typename T> -void dft_plan<T>::init_fft(size_t size, dft_order) +KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order) { const size_t log2n = ilog2(size); cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n, [&](auto log2n) { (void)log2n; constexpr size_t log2nv = val_of(decltype(log2n)()); - this->add_stage<intrinsics::fft_specialization<T, log2nv>>(size); + add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size); }, [&]() { cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { - this->make_fft(size, is_even, ctrue); + make_fft(self, size, is_even, ctrue); constexpr size_t is_evenv = val_of(decltype(is_even)()); - if (need_reorder) - this->add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(size); + add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(self, size); }); }); } template <typename T> -dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0) -{ - need_reorder = true; - if (is_poweroftwo(size)) - { - init_fft(size, order); - } -#ifndef KFR_DFT_NO_NPo2 - else - { - init_dft(size, order); - } -#endif - initialize(); -} - -template <typename T> -dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4) +KFR_INTRINSIC void generate_real_twiddles(dft_plan_real<T>* self, size_t size) { using namespace intrinsics; - constexpr size_t width = vector_width<T> * 2; - block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { constexpr size_t width = val_of(decltype(w)()); - cwrite<width>(rtwiddle.data() + i, + cwrite<width>(self->rtwiddle.data() + i, cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2))))); }); } template <typename T> -void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const +KFR_INTRINSIC void to_fmt(size_t real_size, 
const complex<T>* rtwiddle, complex<T>* out, const complex<T>* in, + dft_pack_format fmt) { using namespace intrinsics; - size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 + size_t csize = real_size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2 constexpr size_t width = vector_width<T> * 2; const cvec<T, 1> dc = cread<1>(out); @@ -1053,9 +967,9 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const i++; constexpr size_t width = val_of(decltype(w)()); constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); - const cvec<T, width> fpk = cread<width>(out + i); - const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1))); + const cvec<T, width> tw = cread<width>(rtwiddle + i); + const cvec<T, width> fpk = cread<width>(in + i); + const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); const cvec<T, width> f1k = fpk + fpnk; const cvec<T, width> f2k = fpk - fpnk; @@ -1066,7 +980,7 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const { size_t k = csize / 2; - const cvec<T, 1> fpk = cread<1>(out + k); + const cvec<T, 1> fpk = cread<1>(in + k); const cvec<T, 1> fpnk = negodd(fpk); cwrite<1>(out + k, fpnk); } @@ -1082,11 +996,12 @@ void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const } template <typename T> -void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const +KFR_INTRINSIC void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const complex<T>* in, + dft_pack_format fmt) { using namespace intrinsics; - const size_t csize = this->size / 2; + const size_t csize = real_size / 2; cvec<T, 1> dc; @@ -1106,7 +1021,7 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_ i++; constexpr size_t width = val_of(decltype(w)()); constexpr size_t widthm1 = width - 1; - const cvec<T, width> tw = cread<width>(rtwiddle.data() + i); + const cvec<T, width> tw = cread<width>(rtwiddle + i); const cvec<T, width> fpk = cread<width>(in + i); const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); @@ -1127,18 +1042,77 @@ void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_ } template <typename T> -dft_plan<T>::~dft_plan() +void init_dft(dft_plan<T>* self, size_t size, dft_order); + +template <typename T> +KFR_INTRINSIC void initialize_stages(dft_plan<T>* self) +{ + if (is_poweroftwo(self->size)) + { + init_fft(self, self->size, dft_order::normal); + } +#ifndef KFR_DFT_NO_NPo2 + else + { + init_dft(self, self->size, dft_order::normal); + } +#endif +} + +template <typename T> +void dft_initialize(dft_plan<T>& plan) { + initialize_stages(&plan); + initialize_data(&plan); + initialize_order(&plan); } template <typename T> -void dft_plan<T>::dump() const +struct dft_stage_real_repack : dft_stage<T> { - for (const dft_stage_ptr& s : stages) +public: + dft_stage_real_repack(size_t real_size, dft_pack_format fmt) + { + this->user = static_cast<int>(fmt); + this->stage_size = real_size; + this->data_size = align_up(sizeof(complex<T>) * (real_size / 4), platform<>::native_cache_alignment); + } + void do_initialize(size_t) override { - s->dump(); + using namespace intrinsics; + constexpr size_t width = vector_width<T> * 2; + const size_t real_size = this->stage_size; + complex<T>* rtwiddle = ptr_cast<complex<T>>(this->data); + 
block_process(real_size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) { + constexpr size_t width = val_of(decltype(w)()); + cwrite<width>(rtwiddle + i, + cossin(dup(-constants<T>::pi * + ((enumerate<T, width>() + i + real_size / 4) / (real_size / 2))))); + }); } + void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override + { + to_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in, + static_cast<dft_pack_format>(this->user)); + } + void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override + { + from_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in, + static_cast<dft_pack_format>(this->user)); + } +}; + +template <typename T> +void dft_real_initialize(dft_plan_real<T>& plan) +{ + initialize_stages(&plan); + plan.fmt_stage.reset(new dft_stage_real_repack<T>(plan.size, plan.fmt)); + plan.data_size += plan.fmt_stage->data_size; + size_t offset = initialize_data(&plan); + initialize_data_stage(&plan, plan.fmt_stage, offset); + initialize_order(&plan); } + } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp @@ -31,19 +31,8 @@ namespace kfr { inline namespace CMT_ARCH_NAME { - -template dft_plan<FLOAT>::dft_plan(size_t, dft_order); -template void dft_plan<FLOAT>::init_fft(size_t, dft_order); -template dft_plan<FLOAT>::~dft_plan(); -template void dft_plan<FLOAT>::dump() const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out, - const kfr::complex<FLOAT>* in, kfr::u8* temp) const; -template dft_plan_real<FLOAT>::dft_plan_real(size_t); -template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in, - kfr::dft_pack_format fmt) const; -template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const; +template void dft_initialize<FLOAT>(dft_plan<FLOAT>& plan); +template void dft_real_initialize<FLOAT>(dft_plan_real<FLOAT>& plan); } // namespace CMT_ARCH_NAME } // namespace kfr
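
The C API in dft_c.h moves in the same direction: kfr_dft_create_real_plan_f32/f64 now accept the KFR_DFT_PACK_FORMAT at plan creation, and the kfr_dft_execute_real_* functions drop that parameter. A hedged sketch of the migrated call sequence, assuming the CCs enumerator name and a placeholder temp-buffer size (this diff does not show a C-side query for the required temp size):

#include <kfr/dft/dft_c.h> /* C API header changed in this commit */
#include <stdint.h>
#include <stdlib.h>

/* Forward real FFT of n samples through the refactored C API.
   For CCs packing, out must hold 2*(n/2+1) floats (interleaved re/im). */
void real_fft_c(float* out, const float* in, size_t n)
{
    /* Pack format is now bound to the plan (signature from the dft_c.h diff);
       the enumerator name CCs is an assumption, not shown in this commit. */
    KFR_DFT_REAL_PLAN_F32* plan = kfr_dft_create_real_plan_f32(n, CCs);

    /* Placeholder size: the actual requirement is the plan's temp_size,
       which this diff does not expose through the C interface. */
    uint8_t* temp = (uint8_t*)malloc(16 * n + 1024);

    /* The execute call no longer takes a KFR_DFT_PACK_FORMAT argument. */
    kfr_dft_execute_real_f32(plan, n, out, in, temp);

    free(temp);
    kfr_dft_delete_real_plan_f32(plan);
}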