commit 194011494c612013f9dc8d75e96bd7fbd97e9b75
parent a92f4352d611566e7bbf3590d0768a9f444d8c14
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Tue, 27 Nov 2018 20:04:10 +0000
Non-power of 2 DFT
Diffstat:
6 files changed, 723 insertions(+), 247 deletions(-)
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ KFR is a header-only library (except DFT) and has no external dependencies.
* Full AVX-512 support
* EBU R128
-* Non-power of two DFT (coming soon)
+* Non-power of two DFT
* TruePeak (coming soon)
* Number of automatic tests has been increased
* GPL version changed from 3 to 2+
diff --git a/include/kfr/dft/dft-src.cpp b/include/kfr/dft/dft-src.cpp
@@ -45,7 +45,7 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
namespace kfr
{
-constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 10> dft_radices{};
+constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{};
#define DFT_ASSERT TESTO_ASSERT_INACTIVE
@@ -58,22 +58,30 @@ using cinvert_t = ctrue_t;
template <typename T>
struct dft_stage
{
+ size_t radix = 0;
size_t stage_size = 0;
size_t data_size = 0;
size_t temp_size = 0;
u8* data = nullptr;
size_t repeats = 1;
size_t out_offset = 0;
- size_t width = 0;
size_t blocks = 0;
- const char* name;
- bool recursion = false;
- bool can_inplace = true;
- bool inplace = false;
- bool to_scratch = false;
+ const char* name = nullptr;
+ bool recursion = false;
+ bool can_inplace = true;
+ bool inplace = false;
+ bool to_scratch = false;
+ bool need_reorder = true;
void initialize(size_t size) { do_initialize(size); }
+ virtual void dump() const
+ {
+ printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
+ stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
+ to_scratch);
+ }
+
KFR_INTRIN void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
{
do_execute(cdirect_t(), out, in, temp);
@@ -472,7 +480,7 @@ template <typename T>
static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
{
complex<T>* twiddle = ptr_cast<complex<T>>(stage->data);
- const size_t N = stage->repeats * stage->stage_size;
+ const size_t N = stage->repeats * stage->radix;
const size_t Nord = stage->repeats;
size_t i = 0;
@@ -482,7 +490,7 @@ static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
for (; i < Nord / width * width; i += width)
{
CMT_LOOP_NOUNROLL
- for (size_t j = 1; j < stage->stage_size; j++)
+ for (size_t j = 1; j < stage->radix; j++)
{
CMT_LOOP_NOUNROLL
for (size_t k = 0; k < width; k++)
@@ -502,16 +510,17 @@ struct dft_stage_fixed_impl : dft_stage<T>
{
dft_stage_fixed_impl(size_t radix_, size_t iterations, size_t blocks)
{
- this->stage_size = radix;
- this->blocks = blocks;
- this->repeats = iterations;
- this->recursion = false; // true;
+ this->name = type_name<decltype(*this)>();
+ this->radix = radix;
+ this->blocks = blocks;
+ this->repeats = iterations;
+ this->recursion = false; // true;
this->data_size =
align_up((this->repeats * (radix - 1)) * sizeof(complex<T>), platform<>::native_cache_alignment);
}
-protected:
- constexpr static size_t width = fft_vector_width<T>;
+ constexpr static size_t width =
+ radix >= 7 ? fft_vector_width<T> / 2 : radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2;
virtual void do_initialize(size_t size) override final { dft_stage_fixed_initialize(this, width); }
DFT_STAGE_FN
@@ -521,7 +530,7 @@ protected:
const size_t Nord = this->repeats;
const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- const size_t N = Nord * this->stage_size;
+ const size_t N = Nord * this->radix;
CMT_LOOP_NOUNROLL
for (size_t b = 0; b < this->blocks; b++)
{
@@ -537,13 +546,16 @@ struct dft_stage_fixed_final_impl : dft_stage<T>
{
dft_stage_fixed_final_impl(size_t radix_, size_t iterations, size_t blocks)
{
- this->stage_size = radix;
+ this->name = type_name<decltype(*this)>();
+ this->radix = radix;
this->blocks = blocks;
this->repeats = iterations;
- this->recursion = false; // true;
+ this->recursion = false;
this->can_inplace = false;
}
- constexpr static size_t width = fft_vector_width<T>;
+ constexpr static size_t width =
+ radix >= 7 ? fft_vector_width<T> / 2 : radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2;
+
DFT_STAGE_FN
template <bool inverse>
KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
@@ -555,19 +567,163 @@ struct dft_stage_fixed_final_impl : dft_stage<T>
}
};
+template <typename E>
+inline E& apply_conj(E& e, cfalse_t)
+{
+ return e;
+}
+
+template <typename E>
+inline auto apply_conj(E& e, ctrue_t)
+{
+ return cconj(e);
+}
+
+/// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1]
+template <typename E>
+struct fft_inverse : expression_base<E>
+{
+ using value_type = value_type_of<E>;
+
+ CMT_INLINE fft_inverse(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {}
+
+ CMT_INLINE vec<value_type, 1> operator()(cinput_t input, size_t index, vec_t<value_type, 1>) const
+ {
+ return this->argument_first(input, index == 0 ? 0 : this->size() - index, vec_t<value_type, 1>());
+ }
+
+ template <size_t N>
+ CMT_INLINE vec<value_type, N> operator()(cinput_t input, size_t index, vec_t<value_type, N>) const
+ {
+ if (index == 0)
+ {
+ return concat(
+ this->argument_first(input, index, vec_t<value_type, 1>()),
+ reverse(this->argument_first(input, this->size() - (N - 1), vec_t<value_type, N - 1>())));
+ }
+ return reverse(this->argument_first(input, this->size() - index - (N - 1), vec_t<value_type, N>()));
+ }
+};
+
+template <typename E>
+inline auto apply_fft_inverse(E&& e)
+{
+ return fft_inverse<E>(std::forward<E>(e));
+}
+
+template <typename T>
+struct dft_arblen_stage_impl : dft_stage<T>
+{
+ dft_arblen_stage_impl(size_t size)
+ : fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal), size(size)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->radix = size;
+ this->blocks = 1;
+ this->repeats = 1;
+ this->recursion = false;
+ this->can_inplace = false;
+ this->temp_size = plan.temp_size;
+
+ chirp_ = render(cexp(sqr(linspace(T(1) - size, size - T(1), size * 2 - 1, true, true)) *
+ complex<T>(0, -1) * c_pi<T> / size));
+
+ ichirpp_ = render(truncate(padded(1 / slice(chirp_, 0, 2 * size - 1)), fftsize));
+
+ univector<u8> temp(plan.temp_size);
+ plan.execute(ichirpp_, ichirpp_, temp);
+ xp.resize(fftsize, 0);
+ xp_fft.resize(fftsize);
+ invN2 = T(1) / fftsize;
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ const size_t n = this->size;
+ const size_t N2 = this->fftsize;
+
+ auto&& chirp = apply_conj(chirp_, cbool<inverse>);
+
+ xp.slice(0, n) = make_univector(in, n) * slice(chirp, n - 1);
+
+ plan.execute(xp_fft.data(), xp.data(), temp);
+
+ if (inverse)
+ xp_fft = xp_fft * cconj(apply_fft_inverse(ichirpp_));
+ else
+ xp_fft = xp_fft * ichirpp_;
+ plan.execute(xp_fft.data(), xp_fft.data(), temp, ctrue);
+
+ make_univector(out, n) = xp_fft.slice(n - 1) * slice(chirp, n - 1) * invN2;
+ }
+
+ const size_t size;
+ const size_t fftsize;
+ T invN2;
+ dft_plan<T> plan;
+ univector<complex<T>> chirp_;
+ univector<complex<T>> ichirpp_;
+ univector<complex<T>> xp;
+ univector<complex<T>> xp_fft;
+};
+
+template <typename T, size_t radix1, size_t radix2, size_t size = radix1* radix2>
+struct dft_special_stage_impl : dft_stage<T>
+{
+ dft_special_stage_impl() : stage1(radix1, size / radix1, 1), stage2(radix2, 1, size / radix2)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->radix = size;
+ this->blocks = 1;
+ this->repeats = 1;
+ this->recursion = false;
+ this->can_inplace = false;
+ this->temp_size = stage1.temp_size + stage2.temp_size + sizeof(complex<T>) * size;
+ this->data_size = stage1.data_size + stage2.data_size;
+ }
+ void dump() const override
+ {
+ dft_stage<T>::dump();
+ printf(" ");
+ stage1.dump();
+ printf(" ");
+ stage2.dump();
+ }
+ void do_initialize(size_t stage_size) override
+ {
+ stage1.data = this->data;
+ stage2.data = this->data + stage1.data_size;
+ stage1.initialize(stage_size);
+ stage2.initialize(stage_size);
+ }
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size);
+ stage1.do_execute(cbool<inverse>, scratch, in, temp);
+ stage2.do_execute(cbool<inverse>, out, scratch, temp + stage1.temp_size);
+ }
+ dft_stage_fixed_impl<T, radix1> stage1;
+ dft_stage_fixed_final_impl<T, radix2> stage2;
+};
+
template <typename T, bool final>
struct dft_stage_generic_impl : dft_stage<T>
{
dft_stage_generic_impl(size_t radix, size_t iterations, size_t blocks)
{
- this->stage_size = radix;
+ this->name = type_name<decltype(*this)>();
+ this->radix = radix;
this->blocks = blocks;
this->repeats = iterations;
this->recursion = false; // true;
this->can_inplace = false;
this->temp_size = align_up(sizeof(complex<T>) * radix, platform<>::native_cache_alignment);
this->data_size =
- align_up(sizeof(complex<T>) * sqr(this->stage_size / 2), platform<>::native_cache_alignment);
+ align_up(sizeof(complex<T>) * sqr(this->radix / 2), platform<>::native_cache_alignment);
}
protected:
@@ -575,13 +731,12 @@ protected:
{
complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
CMT_LOOP_NOUNROLL
- for (size_t i = 0; i < this->stage_size / 2; i++)
+ for (size_t i = 0; i < this->radix / 2; i++)
{
CMT_LOOP_NOUNROLL
- for (size_t j = 0; j < this->stage_size / 2; j++)
+ for (size_t j = 0; j < this->radix / 2; j++)
{
- cwrite<1>(twiddle++,
- cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->stage_size)));
+ cwrite<1>(twiddle++, cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->radix)));
}
}
}
@@ -593,17 +748,17 @@ protected:
const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
const size_t bl = this->blocks;
const size_t Nord = this->repeats;
- const size_t N = Nord * this->stage_size;
+ const size_t N = Nord * this->radix;
CMT_LOOP_NOUNROLL
for (size_t b = 0; b < bl; b++)
- generic_butterfly(this->stage_size, cbool<inverse>, out + b, in + b * this->stage_size,
+ generic_butterfly(this->radix, cbool<inverse>, out + b, in + b * this->radix,
ptr_cast<complex<T>>(temp), twiddle, bl);
}
};
template <typename T, typename Tr2>
-inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 r2)
+inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 first_radix)
{
CMT_ASSUME(r0 > 1);
CMT_ASSUME(r1 > 1);
@@ -617,20 +772,20 @@ inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t
{
const complex<T>* in2 = in1;
CMT_LOOP_UNROLL
- for (size_t j = 0; j < r2; j++)
+ for (size_t j = 0; j < first_radix; j++)
{
*out++ = *in2;
in2 += r1;
}
in1++;
- in += r2;
+ in += first_radix;
}
}
}
-template <typename T>
+template <typename T, typename Tr2>
inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_t* radices, size_t count,
- size_t index, size_t inscale, size_t inner_size)
+ size_t index, size_t inscale, size_t inner_size, Tr2 first_radix)
{
const bool b = index == 1;
const size_t radix = radices[index];
@@ -641,7 +796,7 @@ inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_
{
const complex<T>* in1 = in;
CMT_LOOP_UNROLL
- for (size_t j = 0; j < radices[0]; j++)
+ for (size_t j = 0; j < first_radix; j++)
{
*out++ = *in1;
in1 += inner_size;
@@ -656,7 +811,7 @@ inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_
CMT_LOOP_NOUNROLL
for (size_t i = 0; i < steps; i++)
{
- dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size);
+ dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size, first_radix);
in += inscale;
}
}
@@ -667,15 +822,15 @@ struct dft_reorder_stage_impl : dft_stage<T>
{
dft_reorder_stage_impl(const int* radices, size_t count) : count(count)
{
+ this->name = type_name<decltype(*this)>();
this->can_inplace = false;
this->data_size = 0;
- std::copy(std::make_reverse_iterator(radices + count), std::make_reverse_iterator(radices),
- this->radices);
+ std::copy(radices, radices + count, this->radices);
this->inner_size = 1;
this->size = 1;
for (size_t r = 0; r < count; r++)
{
- if (r != 0 && r != count - 2)
+ if (r != 0 && r != count - 1)
this->inner_size *= radices[r];
this->size *= radices[r];
}
@@ -692,38 +847,37 @@ protected:
template <bool inverse>
KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
- // std::copy(in, in + this->size, out);
- // return;
- if (count == 3)
- {
- dft_permute(out, in, radices[2], radices[1], radices[0]);
- }
- else
- {
- const size_t rlast = radices[count - 1];
- for (size_t p = 0; p < rlast; p++)
- {
- dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size);
- in += size / rlast;
- }
- }
- // if (count == 1)
- // {
- // cswitch(dft_radices, radices[0], [&](auto rfirst) { dft_permute3(out, in, 1, 1, rfirst);
- // },
- // [&]() { dft_permute3(out, in, 1, 1, radices[0]); });
- // }
-
- // const size_t rlast = radices[count - 1];
- // const size_t size = this->size / rlast;
- //
- // cswitch(dft_radices, radices[0], [&](auto rfirst) {
- // for (size_t p = 0; p < rlast; p++)
- // {
- // reorder_impl(out, in, index, 1, rfirst);
- // in += size;
- // }
- // });
+ cswitch(dft_radices, radices[0],
+ [&](auto first_radix) {
+ if (count == 3)
+ {
+ dft_permute(out, in, radices[2], radices[1], first_radix);
+ }
+ else
+ {
+ const size_t rlast = radices[count - 1];
+ for (size_t p = 0; p < rlast; p++)
+ {
+ dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, first_radix);
+ in += size / rlast;
+ }
+ }
+ },
+ [&]() {
+ if (count == 3)
+ {
+ dft_permute(out, in, radices[2], radices[1], radices[0]);
+ }
+ else
+ {
+ const size_t rlast = radices[count - 1];
+ for (size_t p = 0; p < rlast; p++)
+ {
+ dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, radices[0]);
+ in += size / rlast;
+ }
+ }
+ });
}
};
@@ -732,6 +886,8 @@ struct fft_stage_impl : dft_stage<T>
{
fft_stage_impl(size_t stage_size)
{
+ this->name = type_name<decltype(*this)>();
+ this->radix = 4;
this->stage_size = stage_size;
this->repeats = 4;
this->recursion = true;
@@ -770,6 +926,8 @@ struct fft_final_stage_impl : dft_stage<T>
{
fft_final_stage_impl(size_t)
{
+ this->name = type_name<decltype(*this)>();
+ this->radix = size;
this->stage_size = size;
this->out_offset = size;
this->repeats = 4;
@@ -863,6 +1021,7 @@ struct fft_reorder_stage_impl : dft_stage<T>
{
fft_reorder_stage_impl(size_t stage_size)
{
+ this->name = type_name<decltype(*this)>();
this->stage_size = stage_size;
log2n = ilog2(stage_size);
this->data_size = 0;
@@ -887,7 +1046,7 @@ struct fft_specialization;
template <typename T>
struct fft_specialization<T, 1> : dft_stage<T>
{
- fft_specialization(size_t) {}
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
protected:
constexpr static bool aligned = false;
@@ -905,7 +1064,7 @@ protected:
template <typename T>
struct fft_specialization<T, 2> : dft_stage<T>
{
- fft_specialization(size_t) {}
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
protected:
constexpr static bool aligned = false;
@@ -923,7 +1082,7 @@ protected:
template <typename T>
struct fft_specialization<T, 3> : dft_stage<T>
{
- fft_specialization(size_t) {}
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
protected:
constexpr static bool aligned = false;
@@ -940,7 +1099,7 @@ protected:
template <typename T>
struct fft_specialization<T, 4> : dft_stage<T>
{
- fft_specialization(size_t) {}
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
protected:
constexpr static bool aligned = false;
@@ -957,7 +1116,7 @@ protected:
template <typename T>
struct fft_specialization<T, 5> : dft_stage<T>
{
- fft_specialization(size_t) {}
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
protected:
constexpr static bool aligned = false;
@@ -974,7 +1133,7 @@ protected:
template <typename T>
struct fft_specialization<T, 6> : dft_stage<T>
{
- fft_specialization(size_t) {}
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
protected:
constexpr static bool aligned = false;
@@ -991,6 +1150,7 @@ struct fft_specialization<T, 7> : dft_stage<T>
{
fft_specialization(size_t)
{
+ this->name = type_name<decltype(*this)>();
this->stage_size = 128;
this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
}
@@ -1018,7 +1178,8 @@ protected:
{
const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
final_pass<inverse>(csize_t<final_size>(), out, in, twiddle);
- fft_reorder(out, csize_t<7>());
+ if (this->need_reorder)
+ fft_reorder(out, csize_t<7>());
}
template <bool inverse>
@@ -1045,7 +1206,11 @@ protected:
template <>
struct fft_specialization<float, 8> : dft_stage<float>
{
- fft_specialization(size_t) { this->temp_size = sizeof(complex<float>) * 256; }
+ fft_specialization(size_t)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->temp_size = sizeof(complex<float>) * 256;
+ }
protected:
using T = float;
@@ -1085,42 +1250,54 @@ template <>
struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
{
using T = double;
- using fft_final_stage_impl<double, false, 256>::fft_final_stage_impl;
+ fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ }
DFT_STAGE_FN
template <bool inverse>
KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
- fft_reorder(out, csize_t<8>());
+ if (this->need_reorder)
+ fft_reorder(out, csize_t<8>());
}
};
template <typename T>
struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
{
- using fft_final_stage_impl<T, false, 512>::fft_final_stage_impl;
+ fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ }
DFT_STAGE_FN
template <bool inverse>
KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr);
- fft_reorder(out, csize_t<9>());
+ if (this->need_reorder)
+ fft_reorder(out, csize_t<9>());
}
};
template <typename T>
struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
{
- using fft_final_stage_impl<T, false, 1024>::fft_final_stage_impl;
+ fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ }
DFT_STAGE_FN
template <bool inverse>
KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr);
- fft_reorder(out, 10, cfalse);
+ if (this->need_reorder)
+ fft_reorder(out, 10, cfalse);
}
};
@@ -1133,7 +1310,7 @@ template <typename Stage, typename... Args>
void dft_plan<T>::add_stage(Args... args)
{
dft_stage<T>* stage = new Stage(args...);
- stage->name = nullptr;
+ stage->need_reorder = need_reorder;
this->data_size += stage->data_size;
this->temp_size += stage->temp_size;
stages.push_back(dft_stage_ptr(stage));
@@ -1207,12 +1384,13 @@ void dft_plan<T>::initialize()
offset += stage->data_size;
}
- bool to_scratch = false;
+ bool to_scratch = false;
+ bool scratch_needed = false;
for (dft_stage_ptr& stage : reversed(stages))
{
if (to_scratch)
{
- this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
+ scratch_needed = true;
}
stage->to_scratch = to_scratch;
if (!stage->can_inplace)
@@ -1220,14 +1398,16 @@ void dft_plan<T>::initialize()
to_scratch = !to_scratch;
}
}
+ if (scratch_needed || !stages[0]->can_inplace)
+ this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
}
template <typename T>
const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in,
- const complex<T>* scratch) const
+ const complex<T>* scratch, bool in_scratch) const
{
if (stage == 0)
- return in;
+ return in_scratch ? scratch : in;
return stages[stage - 1]->to_scratch ? scratch : out;
}
@@ -1241,12 +1421,22 @@ template <typename T>
template <bool inverse>
void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
{
+ if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
+ {
+ return stages[0]->execute(cbool<inverse>, out, in, temp);
+ }
size_t stack[32] = { 0 };
complex<T>* scratch =
ptr_cast<complex<T>>(temp + this->temp_size -
align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
+ bool in_scratch = !stages[0]->can_inplace && in == out;
+ if (in_scratch)
+ {
+ internal::builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
+ }
+
const size_t count = stages.size();
for (size_t depth = 0; depth < count;)
@@ -1266,7 +1456,7 @@ void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T
else
{
complex<T>* rout = select_out(rdepth, out, scratch);
- const complex<T>* rin = select_in(rdepth, out, in, scratch);
+ const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
offset += stages[rdepth]->out_offset;
stack[rdepth]++;
@@ -1281,15 +1471,16 @@ void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T
else
{
stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch),
- select_in(depth, out, in, scratch), temp);
+ select_in(depth, out, in, scratch, in_scratch), temp);
depth++;
}
}
}
template <typename T>
-dft_plan<T>::dft_plan(size_t size, dft_type) : size(size), temp_size(0), data_size(0)
+dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0)
{
+ need_reorder = true;
if (is_poweroftwo(size))
{
const size_t log2n = ilog2(size);
@@ -1303,62 +1494,82 @@ dft_plan<T>::dft_plan(size_t size, dft_type) : size(size), temp_size(0), data_si
cswitch(cfalse_true, is_even(log2n), [&](auto is_even) {
this->make_fft(size, is_even, ctrue);
constexpr size_t is_evenv = val_of(decltype(is_even)());
- this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size);
+ if (need_reorder)
+ this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size);
});
});
}
+#ifndef KFR_DFT_NO_NPo2
else
{
- size_t cur_size = size;
- constexpr size_t radices_count = dft_radices.back() + 1;
- u8 count[radices_count] = { 0 };
- int radices[32] = { 0 };
- size_t radices_size = 0;
+ if (size == 60)
+ {
+ this->add_stage<internal::dft_special_stage_impl<T, 6, 10>>();
+ }
+ else if (size == 48)
+ {
+ this->add_stage<internal::dft_special_stage_impl<T, 6, 8>>();
+ }
+ else
+ {
+ size_t cur_size = size;
+ constexpr size_t radices_count = dft_radices.back() + 1;
+ u8 count[radices_count] = { 0 };
+ int radices[32] = { 0 };
+ size_t radices_size = 0;
+
+ cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) {
+ while (cur_size && cur_size % val_of(radix) == 0)
+ {
+ count[val_of(radix)]++;
+ cur_size /= val_of(radix);
+ }
+ });
- cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) {
- while (cur_size && cur_size % val_of(radix) == 0)
+ if (cur_size >= 101)
{
- count[val_of(radix)]++;
- cur_size /= val_of(radix);
+ this->add_stage<internal::dft_arblen_stage_impl<T>>(size);
}
- });
+ else
+ {
+ size_t blocks = 1;
+ size_t iterations = size;
- size_t blocks = 1;
- size_t iterations = size;
+ for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++)
+ {
+ for (size_t i = 0; i < count[r]; i++)
+ {
+ iterations /= r;
+ radices[radices_size++] = r;
+ if (iterations == 1)
+ this->prepare_dft_stage(r, iterations, blocks, ctrue);
+ else
+ this->prepare_dft_stage(r, iterations, blocks, cfalse);
+ blocks *= r;
+ }
+ }
- for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++)
- {
- for (size_t i = 0; i < count[r]; i++)
- {
- iterations /= r;
- radices[radices_size++] = r;
- if (iterations == 1)
- this->prepare_dft_stage(r, iterations, blocks, ctrue);
- else
- this->prepare_dft_stage(r, iterations, blocks, cfalse);
- blocks *= r;
- }
- }
+ if (cur_size > 1)
+ {
+ iterations /= cur_size;
+ radices[radices_size++] = cur_size;
+ if (iterations == 1)
+ this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
+ else
+ this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+ }
- if (cur_size > 1)
- {
- iterations /= cur_size;
- radices[radices_size++] = cur_size;
- if (iterations == 1)
- this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
- else
- this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+ if (stages.size() > 2)
+ this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size);
+ }
}
-
- if (stages.size() > 2)
- this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size);
}
+#endif
initialize();
}
template <typename T>
-dft_plan_real<T>::dft_plan_real(size_t size, dft_type type)
- : dft_plan<T>(size / 2, type), size(size), rtwiddle(size / 4)
+dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4)
{
using namespace internal;
@@ -1463,6 +1674,15 @@ dft_plan<T>::~dft_plan()
{
}
+template <typename T>
+void dft_plan<T>::dump() const
+{
+ for (const dft_stage_ptr& s : stages)
+ {
+ s->dump();
+ }
+}
+
namespace internal
{
@@ -1621,24 +1841,26 @@ template void convolve_filter<double>::set_data(const univector<double>&);
template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size);
template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size);
-template dft_plan<float>::dft_plan(size_t, dft_type);
+template dft_plan<float>::dft_plan(size_t, dft_order);
template dft_plan<float>::~dft_plan();
+template void dft_plan<float>::dump() const;
template void dft_plan<float>::execute_dft(cometa::cbool_t<false>, kfr::complex<float>* out,
const kfr::complex<float>* in, kfr::u8* temp) const;
template void dft_plan<float>::execute_dft(cometa::cbool_t<true>, kfr::complex<float>* out,
const kfr::complex<float>* in, kfr::u8* temp) const;
-template dft_plan_real<float>::dft_plan_real(size_t, dft_type);
+template dft_plan_real<float>::dft_plan_real(size_t);
template void dft_plan_real<float>::from_fmt(kfr::complex<float>* out, const kfr::complex<float>* in,
kfr::dft_pack_format fmt) const;
template void dft_plan_real<float>::to_fmt(kfr::complex<float>* out, kfr::dft_pack_format fmt) const;
-template dft_plan<double>::dft_plan(size_t, dft_type);
+template dft_plan<double>::dft_plan(size_t, dft_order);
template dft_plan<double>::~dft_plan();
+template void dft_plan<double>::dump() const;
template void dft_plan<double>::execute_dft(cometa::cbool_t<false>, kfr::complex<double>* out,
const kfr::complex<double>* in, kfr::u8* temp) const;
template void dft_plan<double>::execute_dft(cometa::cbool_t<true>, kfr::complex<double>* out,
const kfr::complex<double>* in, kfr::u8* temp) const;
-template dft_plan_real<double>::dft_plan_real(size_t, dft_type);
+template dft_plan_real<double>::dft_plan_real(size_t);
template void dft_plan_real<double>::from_fmt(kfr::complex<double>* out, const kfr::complex<double>* in,
kfr::dft_pack_format fmt) const;
template void dft_plan_real<double>::to_fmt(kfr::complex<double>* out, kfr::dft_pack_format fmt) const;
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -54,6 +54,12 @@ enum class dft_type
inverse
};
+enum class dft_order
+{
+ normal,
+ internal, // possibly bit/digit-reversed, implementation-defined, faster
+};
+
template <typename T>
struct dft_stage;
@@ -65,7 +71,9 @@ struct dft_plan
size_t size;
size_t temp_size;
- dft_plan(size_t size, dft_type = dft_type::both);
+ dft_plan(size_t size, dft_order order = dft_order::normal);
+
+ void dump() const;
KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const
{
@@ -101,6 +109,7 @@ protected:
autofree<u8> data;
size_t data_size;
std::vector<dft_stage_ptr> stages;
+ bool need_reorder;
template <typename Stage, typename... Args>
void add_stage(Args... args);
@@ -115,7 +124,8 @@ protected:
template <bool inverse>
void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const;
- const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in, const complex<T>* scratch) const;
+ const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
+ const complex<T>* scratch, bool in_scratch) const;
complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const;
};
@@ -129,7 +139,7 @@ template <typename T>
struct dft_plan_real : dft_plan<T>
{
size_t size;
- dft_plan_real(size_t size, dft_type = dft_type::both);
+ dft_plan_real(size_t size);
void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete;
diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp
@@ -542,8 +542,10 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
template <size_t N, typename T>
KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1)
{
- w0 = a0 + a1;
- w1 = a0 - a1;
+ const cvec<T, N> sum = a0 + a1;
+ const cvec<T, N> dif = a0 - a1;
+ w0 = sum;
+ w1 = dif;
}
template <size_t N, typename T>
@@ -1016,21 +1018,25 @@ KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1)
a1 = cmul(a1, tw1);
}
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw3r1 = static_cast<T>(-0.5 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw3i1 =
+ static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
+
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly3(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
- cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02)
+KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01,
+ cvec<T, N>& w02)
{
- const static cvec<T, N> tw3r1 = static_cast<T>(-0.5 - 1.0);
- const static cvec<T, N> tw3i1 =
- static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
const cvec<T, N> sum1 = a01 + a02;
const cvec<T, N> dif1 = swap<2>(a01 - a02);
w00 = a00 + sum1;
- const cvec<T, N> s1 = w00 + sum1 * tw3r1;
+ const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>;
- const cvec<T, N> d1 = dif1 * tw3i1;
+ const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>;
w01 = s1 + d1;
w02 = s1 - d1;
@@ -1073,22 +1079,80 @@ KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<
butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5);
}
+template <typename T, bool inverse = false>
+const static cvec<T, 1> tw9_1 = { T(0.76604444311897803520239265055541),
+ (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) };
+template <typename T, bool inverse = false>
+const static cvec<T, 1> tw9_2 = { T(0.17364817766693034885171662676931),
+ (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) };
+template <typename T, bool inverse = false>
+const static cvec<T, 1> tw9_4 = { T(-0.93969262078590838405410927732473),
+ (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) };
+
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
- const cvec<T, N>& a03, const cvec<T, N>& a04, const cvec<T, N>& a05,
- const cvec<T, N>& a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
- cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06)
+KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+ const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+ const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, cvec<T, N>& w0,
+ cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5,
+ cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8)
+{
+ cvec<T, N* 3> a012 = concat(a0, a1, a2);
+ cvec<T, N* 3> a345 = concat(a3, a4, a5);
+ cvec<T, N* 3> a678 = concat(a6, a7, a8);
+ butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678);
+ cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8;
+ split(a012, t0, t1, t2);
+ split(a345, t3, t4, t5);
+ split(a678, t6, t7, t8);
+
+ t4 = cmul(t4, tw9_1<T, inverse>);
+ t5 = cmul(t5, tw9_2<T, inverse>);
+ t7 = cmul(t7, tw9_2<T, inverse>);
+ t8 = cmul(t8, tw9_4<T, inverse>);
+
+ cvec<T, N* 3> t036 = concat(t0, t3, t6);
+ cvec<T, N* 3> t147 = concat(t1, t4, t7);
+ cvec<T, N* 3> t258 = concat(t2, t5, t8);
+
+ butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258);
+ split(t036, w0, w1, w2);
+ split(t147, w3, w4, w5);
+ split(t258, w6, w7, w8);
+}
+
+template <size_t N, bool inverse = false, typename T>
+KFR_INTRIN void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
+ cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8)
{
- const static cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884 - 1.0);
- const static cvec<T, N> tw7i1 =
- static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>();
- const static cvec<T, N> tw7r2 = static_cast<T>(-0.2225209339563144042889025645 - 1.0);
- const static cvec<T, N> tw7i2 =
- static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>();
- const static cvec<T, N> tw7r3 = static_cast<T>(-0.90096886790241912623610231951 - 1.0);
- const static cvec<T, N> tw7i3 =
- static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
+ butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8);
+}
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7r1 = static_cast<T>(0.623489801858733530525004884 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7i1 =
+ static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7r2 = static_cast<T>(-0.2225209339563144042889025645 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7i2 =
+ static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7r3 = static_cast<T>(-0.90096886790241912623610231951 - 1.0);
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw7i3 =
+ static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
+
+template <size_t N, bool inverse = false, typename T>
+KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
+ cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
+ cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06)
+{
const cvec<T, N> sum1 = a01 + a06;
const cvec<T, N> dif1 = swap<2>(a01 - a06);
const cvec<T, N> sum2 = a02 + a05;
@@ -1097,13 +1161,19 @@ KFR_INTRIN void butterfly7(const cvec<T, N>& a00, const cvec<T, N>& a01, const c
const cvec<T, N> dif3 = swap<2>(a03 - a04);
w00 = a00 + sum1 + sum2 + sum3;
- const cvec<T, N> s1 = w00 + sum1 * tw7r1 + sum2 * tw7r2 + sum3 * tw7r3;
- const cvec<T, N> s2 = w00 + sum1 * tw7r2 + sum2 * tw7r3 + sum3 * tw7r1;
- const cvec<T, N> s3 = w00 + sum1 * tw7r3 + sum2 * tw7r1 + sum3 * tw7r2;
+ const cvec<T, N> s1 =
+ w00 + sum1 * tw7r1<T, N, inverse> + sum2 * tw7r2<T, N, inverse> + sum3 * tw7r3<T, N, inverse>;
+ const cvec<T, N> s2 =
+ w00 + sum1 * tw7r2<T, N, inverse> + sum2 * tw7r3<T, N, inverse> + sum3 * tw7r1<T, N, inverse>;
+ const cvec<T, N> s3 =
+ w00 + sum1 * tw7r3<T, N, inverse> + sum2 * tw7r1<T, N, inverse> + sum3 * tw7r2<T, N, inverse>;
- const cvec<T, N> d1 = dif1 * tw7i1 + dif2 * tw7i2 + dif3 * tw7i3;
- const cvec<T, N> d2 = dif1 * tw7i2 - dif2 * tw7i3 - dif3 * tw7i1;
- const cvec<T, N> d3 = dif1 * tw7i3 - dif2 * tw7i1 + dif3 * tw7i2;
+ const cvec<T, N> d1 =
+ dif1 * tw7i1<T, N, inverse> + dif2 * tw7i2<T, N, inverse> + dif3 * tw7i3<T, N, inverse>;
+ const cvec<T, N> d2 =
+ dif1 * tw7i2<T, N, inverse> - dif2 * tw7i3<T, N, inverse> - dif3 * tw7i1<T, N, inverse>;
+ const cvec<T, N> d3 =
+ dif1 * tw7i3<T, N, inverse> - dif2 * tw7i1<T, N, inverse> + dif3 * tw7i2<T, N, inverse>;
w01 = s1 + d1;
w06 = s1 - d1;
@@ -1120,29 +1190,131 @@ KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<
butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6);
}
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i1 =
+ static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i2 =
+ static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r3 = static_cast<T>(-0.14231483827328514044379266862 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i3 =
+ static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i4 =
+ static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>();
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0);
+
+template <typename T, size_t N, bool inverse>
+static const cvec<T, N> tw11i5 =
+ static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>();
+
+template <size_t N, bool inverse = false, typename T>
+KFR_INTRIN void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
+ cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09,
+ cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
+ cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06,
+ cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10)
+{
+ const cvec<T, N> sum1 = a01 + a10;
+ const cvec<T, N> dif1 = swap<2>(a01 - a10);
+ const cvec<T, N> sum2 = a02 + a09;
+ const cvec<T, N> dif2 = swap<2>(a02 - a09);
+ const cvec<T, N> sum3 = a03 + a08;
+ const cvec<T, N> dif3 = swap<2>(a03 - a08);
+ const cvec<T, N> sum4 = a04 + a07;
+ const cvec<T, N> dif4 = swap<2>(a04 - a07);
+ const cvec<T, N> sum5 = a05 + a06;
+ const cvec<T, N> dif5 = swap<2>(a05 - a06);
+ w00 = a00 + sum1 + sum2 + sum3 + sum4 + sum5;
+
+ const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> +
+ sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> +
+ sum5 * tw11r5<T, N, inverse>;
+ const cvec<T, N> s2 = w00 + sum1 * tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> +
+ sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> +
+ sum5 * tw11r1<T, N, inverse>;
+ const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> +
+ sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> +
+ sum5 * tw11r2<T, N, inverse>;
+ const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> +
+ sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> +
+ sum5 * tw11r3<T, N, inverse>;
+ const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> +
+ sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> +
+ sum5 * tw11r4<T, N, inverse>;
+
+ const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> +
+ dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> +
+ dif5 * tw11i5<T, N, inverse>;
+ const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> -
+ dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> -
+ dif5 * tw11i1<T, N, inverse>;
+ const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> +
+ dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> +
+ dif5 * tw11i2<T, N, inverse>;
+ const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> +
+ dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> -
+ dif5 * tw11i3<T, N, inverse>;
+ const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> +
+ dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> +
+ dif5 * tw11i4<T, N, inverse>;
+
+ w01 = s1 + d1;
+ w10 = s1 - d1;
+ w02 = s2 + d2;
+ w09 = s2 - d2;
+ w03 = s3 + d3;
+ w08 = s3 - d3;
+ w04 = s4 + d4;
+ w07 = s4 - d4;
+ w05 = s5 + d5;
+ w06 = s5 - d5;
+}
+
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718 - 1.0);
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5i1 =
+ static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>();
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5r2 = static_cast<T>(-0.80901699437494742410229341718 - 1.0);
+template <typename T, size_t N, bool inverse>
+const static cvec<T, N> tw5i2 =
+ static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
+
template <size_t N, bool inverse = false, typename T>
KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
{
- const static cvec<T, N> tw5r1 = static_cast<T>(0.30901699437494742410229341718 - 1.0);
- const static cvec<T, N> tw5i1 =
- static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>();
- const static cvec<T, N> tw5r2 = static_cast<T>(-0.80901699437494742410229341718 - 1.0);
- const static cvec<T, N> tw5i2 =
- static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
-
const cvec<T, N> sum1 = a01 + a04;
const cvec<T, N> dif1 = swap<2>(a01 - a04);
const cvec<T, N> sum2 = a02 + a03;
const cvec<T, N> dif2 = swap<2>(a02 - a03);
w00 = a00 + sum1 + sum2;
- const cvec<T, N> s1 = w00 + sum1 * tw5r1 + sum2 * tw5r2;
- const cvec<T, N> s2 = w00 + sum1 * tw5r2 + sum2 * tw5r1;
+ const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse> + sum2 * tw5r2<T, N, inverse>;
+ const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse> + sum2 * tw5r1<T, N, inverse>;
- const cvec<T, N> d1 = dif1 * tw5i1 + dif2 * tw5i2;
- const cvec<T, N> d2 = dif1 * tw5i2 - dif2 * tw5i1;
+ const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse> + dif2 * tw5i2<T, N, inverse>;
+ const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse> - dif2 * tw5i1<T, N, inverse>;
w01 = s1 + d1;
w04 = s1 - d1;
@@ -1245,6 +1417,16 @@ KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N
template <bool inverse, typename T, size_t N>
KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+ const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, vec<T, N>& out0,
+ vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
+ vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8)
+{
+ butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4,
+ out5, out6, out7, out8);
+}
+template <bool inverse, typename T, size_t N>
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+ const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
@@ -1253,6 +1435,17 @@ KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N
butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3,
out4, out5, out6, out7, out8, out9);
}
+template <bool inverse, typename T, size_t N>
+KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
+ const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
+ const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
+ const vec<T, N>& in9, const vec<T, N>& in10, vec<T, N>& out0, vec<T, N>& out1,
+ vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
+ vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10)
+{
+ butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2,
+ out3, out4, out5, out6, out7, out8, out9, out10);
+}
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
{
@@ -1360,15 +1553,17 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args)
butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...);
}
-template <typename T, bool inverse, typename Tstride>
-KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, size_t, cbool_t<inverse>, complex<T>*, const complex<T>*,
- Tstride, size_t, size_t, const complex<T>*, size_t)
+template <typename T, bool inverse, typename Tradix, typename Tstride>
+KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, Tradix radix, cbool_t<inverse>, complex<T>*,
+ const complex<T>*, Tstride, size_t, size_t, const complex<T>*, size_t)
{
}
-template <size_t width, bool inverse, typename T, typename Tstride>
-KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<inverse>, complex<T>* out,
- const complex<T>* in, Tstride ostride, size_t halfradix,
- size_t halfradix_sqr, const complex<T>* twiddle, size_t i)
+
+template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix,
+ typename Thalfradixsqr, typename Tstride>
+KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out,
+ const complex<T>* in, Tstride ostride, Thalfradix halfradix,
+ Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i)
{
CMT_LOOP_NOUNROLL
for (; i < halfradix / width * width; i += width)
@@ -1377,7 +1572,6 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, size_t radix, cbool_t<in
cvec<T, width> sum0 = resize<2 * width>(in0);
cvec<T, width> sum1 = sum0;
- CMT_LOOP_NOUNROLL
for (size_t j = 0; j < halfradix; j++)
{
const cvec<T, 1> ina = cread<1>(in + (1 + j));
@@ -1442,6 +1636,36 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>*
}
cwrite<1>(out, hcadd(sum) + sums);
}
+ const auto halfradix = radix / 2;
+ const auto halfradix_sqr = halfradix * halfradix;
+ CMT_ASSUME(halfradix > 0);
+ size_t i = 0;
+
+ generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
+ halfradix * halfradix, twiddle, i);
+}
+
+template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>>
+KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out,
+ const complex<T>* in, const complex<T>* twiddle,
+ Tstride ostride = Tstride{})
+{
+ {
+ cvec<T, width> sum = T();
+ size_t j = 0;
+ CMT_LOOP_UNROLL
+ for (; j < radix / width * width; j += width)
+ {
+ sum += cread<width>(in + j);
+ }
+ cvec<T, 1> sums = T();
+ CMT_LOOP_UNROLL
+ for (; j < radix; j++)
+ {
+ sums += cread<1>(in + j);
+ }
+ cwrite<1>(out, hcadd(sum) + sums);
+ }
const size_t halfradix = radix / 2;
const size_t halfradix_sqr = halfradix * halfradix;
CMT_ASSUME(halfradix > 0);
@@ -1455,21 +1679,15 @@ template <typename T, bool inverse, typename Tstride = csize_t<1>>
KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
complex<T>* temp, const complex<T>* twiddle, Tstride ostride = Tstride{})
{
- if (out == in)
- {
- builtin_memcpy(temp, in, sizeof(complex<T>) * radix);
- in = temp;
- }
constexpr size_t width = platform<T>::vector_width;
- generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
- /*cswitch(csizes_t<11>(), radix,
+ cswitch(csizes_t<11, 13>(), radix,
[&](auto radix_) CMT_INLINE_LAMBDA {
- generic_butterfly_w<width>(decltype(radix_)(), cbool_t<inverse>(), out, in, twiddle, ostride);
+ spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride);
},
[&]() CMT_INLINE_LAMBDA {
generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
- });*/
+ });
}
template <typename T, size_t N>
diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp
@@ -191,13 +191,23 @@ struct test_case
check(result, as_string(comparison.left), expr);
}
+ void append_comment(const std::string& text)
+ {
+ comment += text;
+ if (show_progress)
+ {
+ println();
+ println(text, ":");
+ }
+ }
+
void set_comment(const std::string& text)
{
comment = text;
if (show_progress)
{
println();
- println(comment, ":");
+ println(text, ":");
}
}
@@ -344,6 +354,6 @@ void assert_is_same_decay()
#define TEST TESTO_TEST
#define DTEST TESTO_DTEST
#endif
-}
+} // namespace testo
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -10,6 +10,7 @@
#include <kfr/dft.hpp>
#include <kfr/dsp.hpp>
#include <kfr/io.hpp>
+#include <set>
using namespace kfr;
@@ -70,56 +71,71 @@ TEST(fft_accuracy)
{
testo::active_test()->show_progress = true;
random_bit_generator gen(2247448713, 915890490, 864203735, 2982561);
-
- testo::matrix(named("type") = dft_float_types, //
- named("inverse") = std::make_tuple(false, true), //
- named("log2(size)") = make_range(size_t(1), stopsize), //
- [&gen](auto type, bool inverse, size_t log2size) {
- using float_type = type_of<decltype(type)>;
- const size_t size = 1 << log2size;
-
- {
- univector<complex<float_type>> in =
- truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
- univector<complex<float_type>> out = in;
- univector<complex<float_type>> refout = out;
- const dft_plan<float_type> dft(size);
- univector<u8> temp(dft.temp_size);
-
- reference_dft(refout.data(), in.data(), size, inverse);
- dft.execute(out, out, temp, inverse);
-
- const float_type rms_diff = rms(cabs(refout - out));
- const double ops = log2size * 100;
- const double epsilon = std::numeric_limits<float_type>::epsilon();
- CHECK(rms_diff < epsilon * ops);
- }
-
- if (size >= 16)
- {
- univector<float_type> in =
- truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
-
- univector<complex<float_type>> out = truncate(scalar(qnan), size);
- univector<complex<float_type>> refout = truncate(scalar(qnan), size);
- const dft_plan_real<float_type> dft(size);
- univector<u8> temp(dft.temp_size);
-
- reference_fft(refout.data(), in.data(), size);
- dft.execute(out, in, temp);
- const float_type rms_diff_r =
- rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1)));
- const double ops = log2size * 200;
- const double epsilon = std::numeric_limits<float_type>::epsilon();
- CHECK(rms_diff_r < epsilon * ops);
-
- univector<float_type> out2(size, 0.f);
- dft.execute(out2, out, temp);
- out2 = out2 / size;
- const float_type rms_diff_r2 = rms(in - out2);
- CHECK(rms_diff_r2 < epsilon * ops);
- }
- });
+ std::set<size_t> size_set;
+ univector<size_t> sizes = truncate(1 + counter(), stopsize - 1);
+ sizes = round(pow(2.0, sizes));
+
+#ifndef KFR_DFT_NO_NPo2
+ univector<size_t> sizes2 = truncate(2 + counter(), 1024);
+ for (size_t s : sizes2)
+ {
+ if (std::find(sizes.begin(), sizes.end(), s) == sizes.end())
+ sizes.push_back(s);
+ }
+#endif
+ println(sizes);
+
+ testo::matrix(
+ named("type") = dft_float_types, //
+ named("size") = sizes, //
+ [&gen](auto type, size_t size) {
+ using float_type = type_of<decltype(type)>;
+ const double min_prec = 0.000001 * std::log(size) * size;
+
+ for (bool inverse : { false, true })
+ {
+ testo::active_test()->append_comment(inverse ? "complex-inverse" : "complex-direct");
+ univector<complex<float_type>> in =
+ truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
+ univector<complex<float_type>> out = in;
+ univector<complex<float_type>> refout = out;
+ univector<complex<float_type>> outo = in;
+ const dft_plan<float_type> dft(size);
+ univector<u8> temp(dft.temp_size);
+
+ reference_dft(refout.data(), in.data(), size, inverse);
+ dft.execute(outo, in, temp, inverse);
+ dft.execute(out, out, temp, inverse);
+
+ const float_type rms_diff_inplace = rms(cabs(refout - out));
+ CHECK(rms_diff_inplace < min_prec);
+ const float_type rms_diff_outofplace = rms(cabs(refout - outo));
+ CHECK(rms_diff_outofplace < min_prec);
+ }
+
+ if (size >= 4 && is_poweroftwo(size))
+ {
+ univector<float_type> in = truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
+
+ univector<complex<float_type>> out = truncate(scalar(qnan), size);
+ univector<complex<float_type>> refout = truncate(scalar(qnan), size);
+ const dft_plan_real<float_type> dft(size);
+ univector<u8> temp(dft.temp_size);
+
+ testo::active_test()->append_comment("real-direct");
+ reference_fft(refout.data(), in.data(), size);
+ dft.execute(out, in, temp);
+ float_type rms_diff = rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1)));
+ CHECK(rms_diff < min_prec);
+
+ univector<float_type> out2(size, 0.f);
+ testo::active_test()->append_comment("real-inverse");
+ dft.execute(out2, out, temp);
+ out2 = out2 / size;
+ rms_diff = rms(in - out2);
+ CHECK(rms_diff < min_prec);
+ }
+ });
}
#ifndef KFR_NO_MAIN
@@ -127,6 +143,6 @@ int main()
{
println(library_version(), " running on ", cpu_runtime());
- return testo::run_all("", true);
+ return testo::run_all("", false);
}
#endif