commit a5251827c16beec42b386b5e78d0663ae37639d8
parent 6feabe6fffbd44c16a643159e0dc87bcc8f2282f
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Fri, 10 Nov 2023 08:44:48 +0000
DFT performance improvements
Diffstat:
10 files changed, 1250 insertions(+), 312 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -163,8 +163,8 @@ if (APPLE)
endif ()
if (NOT IOS)
if (NOT MSVC OR CLANG)
- target_compile_options(kfr
- INTERFACE "${CLANG_ARG_PREFIX}-mstackrealign")
+ # target_compile_options(kfr
+ # INTERFACE "${CLANG_ARG_PREFIX}-mstackrealign")
endif ()
endif ()
if (MSVC)
@@ -221,7 +221,7 @@ endfunction ()
if (KFR_ENABLE_DFT)
- set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast")
+ set(KFR_DFT_DEFS "${CLANG_ARG_PREFIX}-ffp-contract=fast -Xclang -O3 -mllvm -x86-use-vzeroupper=0")
if (KFR_ENABLE_DFT_MULTIARCH)
add_library(kfr_dft INTERFACE)
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -34,6 +34,7 @@
#include "../simd/constants.hpp"
#include "../simd/read_write.hpp"
#include "../simd/vec.hpp"
+#include <bitset>
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
@@ -49,6 +50,8 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
namespace kfr
{
+#define DFT_MAX_STAGES 32
+
using cdirect_t = cfalse_t;
using cinvert_t = ctrue_t;
@@ -67,17 +70,14 @@ struct dft_stage
const char* name = nullptr;
bool recursion = false;
bool can_inplace = true;
- bool inplace = false;
- bool to_scratch = false;
bool need_reorder = true;
void initialize(size_t size) { do_initialize(size); }
virtual void dump() const
{
- printf("%s: %zu, %zu, %zu, %zu, %zu, %zu, %zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
- stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
- to_scratch);
+ printf("%s: %zu, %zu, %zu, %zu, %zu, %zu, %zu, %d, %d\n", name ? name : "unnamed", radix, stage_size,
+ data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace);
}
KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
@@ -173,7 +173,7 @@ struct dft_plan
void dump() const
{
- for (const std::unique_ptr<dft_stage<T>>& s : stages)
+ for (const std::unique_ptr<dft_stage<T>>& s : all_stages)
{
s->dump();
}
@@ -213,8 +213,89 @@ struct dft_plan
autofree<u8> data;
size_t data_size;
- std::vector<dft_stage_ptr<T>> stages;
+
+ std::vector<dft_stage_ptr<T>> all_stages;
+ std::array<std::vector<dft_stage<T>*>, 2> stages;
bool arblen;
+ using bitset = std::bitset<DFT_MAX_STAGES>;
+ std::array<bitset, 2> disposition_inplace;
+ std::array<bitset, 2> disposition_outofplace;
+
+ void calc_disposition()
+ {
+ for (bool inverse : { false, true })
+ {
+ auto&& stages = this->stages[inverse];
+ bitset can_inplace_per_stage;
+ for (int i = 0; i < stages.size(); ++i)
+ {
+ can_inplace_per_stage[i] = stages[i]->can_inplace;
+ }
+
+ disposition_inplace[static_cast<int>(inverse)] =
+ precompute_disposition(stages.size(), can_inplace_per_stage, true);
+ disposition_outofplace[static_cast<int>(inverse)] =
+ precompute_disposition(stages.size(), can_inplace_per_stage, false);
+ }
+ }
+
+ static bitset precompute_disposition(int num_stages, bitset can_inplace_per_stage, bool inplace_requested)
+ {
+ static bitset even{ 0x5555555555555555ull };
+ bitset mask = ~bitset() >> (DFT_MAX_STAGES - num_stages);
+ bitset result;
+ // disposition indicates where the input of each stage is located
+ // first bit : 0 - input, 1 - scratch
+ // other bits: 0 - output, 1 - scratch
+
+ // build a disposition that always works
+ if (num_stages % 2 == 0)
+ { // even
+ result = ~even & mask;
+ }
+ else
+ { // odd
+ result = even & mask;
+ }
+
+ int num_inplace = can_inplace_per_stage.count();
+
+#ifdef KFR_DFT_ELIMINATE_MEMCPY
+ if (num_inplace > 0 && inplace_requested)
+ {
+ if (result.test(0)) // input is in scratch
+ {
+ // num_inplace must be odd
+ if (num_inplace % 2 == 0)
+ --num_inplace;
+ }
+ else
+ {
+ // num_inplace must be even
+ if (num_inplace % 2 != 0)
+ --num_inplace;
+ }
+ }
+#endif
+ if (num_inplace > 0)
+ {
+ for (int i = num_stages - 1; i >= 0; --i)
+ {
+ if (can_inplace_per_stage.test(i))
+ {
+ result ^= ~bitset() >> (DFT_MAX_STAGES - (i + 1));
+
+ if (--num_inplace == 0)
+ break;
+ }
+ }
+ }
+
+ if (!inplace_requested) // out-of-place first stage; IN->OUT
+ result.reset(0);
+
+ return result;
+ }
protected:
struct noinit
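
The disposition bitsets above replace the old per-stage inplace/to_scratch flags with one precomputed schedule: bit i of a disposition says where stage i reads from (bit 0 selects between the caller's input and scratch, later bits between the output buffer and scratch). A minimal standalone sketch of the resulting ping-pong chain, assuming a simplified re-implementation and ignoring the in-place flips (those are sketched separately after execute_dft below):

    #include <bitset>
    #include <cstdio>

    constexpr int MAX_STAGES = 32;
    using stage_bits = std::bitset<MAX_STAGES>;

    // bit i: 1 = stage i reads scratch, 0 = stage i reads in (i==0) or out
    static stage_bits base_disposition(int num_stages)
    {
        stage_bits even{ 0x5555555555555555ull };
        stage_bits mask = ~stage_bits() >> (MAX_STAGES - num_stages);
        return num_stages % 2 == 0 ? ~even & mask : even & mask;
    }

    int main()
    {
        for (int n : { 2, 3, 4 })
            for (int i = 0; i < n; ++i)
            {
                stage_bits d    = base_disposition(n);
                const char* src = d.test(i) ? "scratch" : (i == 0 ? "in" : "out");
                const char* dst =
                    (i == n - 1) ? "out" : (d.test(i + 1) ? "scratch" : "out");
                std::printf("stages=%d  stage %d: %-7s -> %s\n", n, i, src, dst);
            }
    }

Every chain ends in out, which is why the parity of the stage count decides whether the chain has to start in scratch.
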
@@ -224,32 +305,35 @@ protected:
: size(size), temp_size(0), data_size(0), arblen(false)
{
}
- const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
- const complex<T>* scratch, bool in_scratch) const
+ const complex<T>* select_in(bitset disposition, size_t stage, const complex<T>* out, const complex<T>* in,
+ const complex<T>* scratch) const
{
- if (stage == 0)
- return in_scratch ? scratch : in;
- return stages[stage - 1]->to_scratch ? scratch : out;
+ return disposition.test(stage) ? scratch : stage == 0 ? in : out;
}
- complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
+ complex<T>* select_out(bitset disposition, size_t stage, size_t total_stages, complex<T>* out,
+ complex<T>* scratch) const
{
- return stages[stage]->to_scratch ? scratch : out;
+ return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out;
}
template <bool inverse>
void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
{
+ auto&& stages = this->stages[inverse];
if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
{
return stages[0]->execute(cbool<inverse>, out, in, temp);
}
- size_t stack[32] = { 0 };
+ size_t stack[DFT_MAX_STAGES] = { 0 };
+
+ bitset disposition =
+ in == out ? this->disposition_inplace[inverse] : this->disposition_outofplace[inverse];
complex<T>* scratch = ptr_cast<complex<T>>(
temp + this->temp_size -
align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
- bool in_scratch = !stages[0]->can_inplace && in == out;
+ bool in_scratch = disposition.test(0);
if (in_scratch)
{
builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
@@ -273,8 +357,8 @@ protected:
}
else
{
- complex<T>* rout = select_out(rdepth, out, scratch);
- const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
+ complex<T>* rout = select_out(disposition, rdepth, stages.size(), out, scratch);
+ const complex<T>* rin = select_in(disposition, rdepth, out, in, scratch);
stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
offset += stages[rdepth]->out_offset;
stack[rdepth]++;
@@ -291,8 +375,9 @@ protected:
size_t offset = 0;
while (offset < this->size)
{
- stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch) + offset,
- select_in(depth, out, in, scratch, in_scratch) + offset, temp);
+ stages[depth]->execute(
+ cbool<inverse>, select_out(disposition, depth, stages.size(), out, scratch) + offset,
+ select_in(disposition, depth, out, in, scratch) + offset, temp);
offset += stages[depth]->stage_size;
}
depth++;
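
execute_dft now just picks the precomputed bitset (in-place vs out-of-place) and tests one bit per stage; the cost of choosing buffers moved to plan construction. The in-place optimization in precompute_disposition XORs a suffix mask, ~bitset() >> (DFT_MAX_STAGES - (i + 1)), flipping the input side of stages 0..i so that stage i reads and writes the same buffer. A tiny sketch of that flip, under the same simplified encoding as above:

    #include <bitset>
    #include <cstdio>

    int main()
    {
        constexpr int N = 32;
        const int num_stages = 3, inplace_stage = 1;
        std::bitset<N> d{ 0b101 }; // base for 3 stages: stages 0 and 2 read scratch
        d ^= ~std::bitset<N>() >> (N - (inplace_stage + 1)); // flip bits 0..1
        std::printf("%s\n", d.to_string().substr(N - num_stages).c_str());
        // prints "110" (bits 2,1,0): stage 1 now reads and writes scratch
        // (in-place), and stage 0 reads the caller's input directly, so the
        // initial memcpy into scratch disappears.
    }
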
@@ -306,7 +391,6 @@ struct dft_plan_real : dft_plan<T>
{
size_t size;
dft_pack_format fmt;
- dft_stage_ptr<T> fmt_stage;
explicit dft_plan_real(cpu_t cpu, size_t size, dft_pack_format fmt = dft_pack_format::CCs)
: dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt)
@@ -360,13 +444,10 @@ struct dft_plan_real : dft_plan<T>
KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp, cdirect_t = {}) const
{
this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp);
- fmt_stage->execute(cfalse, out, out, nullptr);
}
KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp, cinvert_t = {}) const
{
- complex<T>* outdata = ptr_cast<complex<T>>(out);
- fmt_stage->execute(ctrue, outdata, in, nullptr);
- this->execute_dft(ctrue, outdata, outdata, temp);
+ this->execute_dft(ctrue, ptr_cast<complex<T>>(out), in, temp);
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
@@ -374,15 +455,12 @@ struct dft_plan_real : dft_plan<T>
univector<u8, Tag3>& temp, cdirect_t = {}) const
{
this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data());
- fmt_stage->execute(cfalse, out.data(), out.data(), nullptr);
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in,
univector<u8, Tag3>& temp, cinvert_t = {}) const
{
- complex<T>* outdata = ptr_cast<complex<T>>(out.data());
- fmt_stage->execute(ctrue, outdata, in.data(), nullptr);
- this->execute_dft(ctrue, outdata, outdata, temp.data());
+ this->execute_dft(ctrue, ptr_cast<complex<T>>(out.data()), in.data(), temp.data());
}
// Deprecated. fmt must be passed to constructor instead
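
With fmt_stage gone, the CCs/Perm packing runs as an ordinary stage inside execute_dft, so the real-transform entry points collapse to a single call. A usage sketch, assuming the usual public (size, fmt) constructor and the temp_size member:

    #include <kfr/dft.hpp>

    int main()
    {
        using namespace kfr;
        const size_t n = 1024;
        dft_plan_real<float> plan(n, dft_pack_format::CCs);
        univector<float> in(n, 0.f);
        in[1] = 1.f; // a single impulse, just to have data
        univector<complex<float>> out(n / 2 + 1); // CCs packing needs n/2+1 bins
        univector<u8> temp(plan.temp_size);
        plan.execute(out.data(), in.data(), temp.data()); // one pass, no fmt_stage
    }
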
@@ -434,7 +512,7 @@ struct dct_plan : dft_plan<T>
}
else
{
- mirrored = make_complex(make_univector(in, size) * cos(t), make_univector(in, size) * -sin(t));
+ mirrored = make_complex(make_univector(in, size) * cos(t), make_univector(in, size) * -sin(t));
mirrored[0] = mirrored[0] * T(0.5);
dft_plan<T>::execute(mirrored_dft.data(), mirrored.data(), temp, cfalse);
for (size_t i = 0; i < halfSize; i++)
diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp
@@ -58,7 +58,12 @@ CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x)
CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
{
if (bits > bitrev_table_log2N)
- return bitreverse<32>(x) >> (32 - bits);
+ {
+ if (bits <= 16)
+ return bitreverse<16>(x) >> (16 - bits);
+ else
+ return bitreverse<32>(x) >> (32 - bits);
+ }
return data::bitrev_table[x] >> (bitrev_table_log2N - bits);
}
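
The fallback now reverses within 16 bits when possible: reversing the low `bits` bits of x equals reversing a wider word and shifting the result down, and the 16-bit reversal needs fewer shift/mask steps. A standalone check of the identity, assuming a plain portable bitreverse<W> in place of KFR's intrinsic:

    #include <cassert>
    #include <cstdint>

    template <int W>
    static uint32_t bitreverse(uint32_t x)
    {
        uint32_t r = 0;
        for (int i = 0; i < W; ++i)
            r |= ((x >> i) & 1u) << (W - 1 - i);
        return r;
    }

    int main()
    {
        const int bits = 12; // any value in (bitrev_table_log2N, 16]
        for (uint32_t x = 0; x < (1u << bits); ++x)
            assert((bitreverse<16>(x) >> (16 - bits)) ==
                   (bitreverse<32>(x) >> (32 - bits)));
    }

The dig4rev_using_table change below applies the same idea to base-4 digit reversal.
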
@@ -66,7 +71,12 @@ CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits)
CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
{
if (bits > bitrev_table_log2N)
- return digitreverse4<32>(x) >> (32 - bits);
+ {
+ if (bits <= 16)
+ return digitreverse4<16>(x) >> (16 - bits);
+ else
+ return digitreverse4<32>(x) >> (32 - bits);
+ }
x = data::bitrev_table[x];
x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
@@ -218,7 +228,7 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>)
}
template <typename T>
-KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */)
{
constexpr size_t bitrev = 4;
fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4);
@@ -232,6 +242,20 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
}
template <typename T>
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */)
+{
+ constexpr size_t bitrev = 2;
+ fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4);
+ fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4);
+ fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4);
+ fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
+ fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4);
+ fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4);
+ fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4);
+ fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4);
+}
+
+template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
{
constexpr size_t bitrev = 2;
@@ -253,6 +277,44 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4);
}
+template <typename T>
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */)
+{
+ constexpr size_t bitrev = 2;
+ fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 7 * 4, 56 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4);
+ fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4);
+ fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4);
+ fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4);
+ fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4);
+}
+
template <typename T, bool use_br2>
KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
{
@@ -285,22 +347,35 @@ KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
{
size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
if (i >= j)
+ {
fft_reorder_swap_n4(io, i, j, N4, use_br2);
+ }
+ else
+ {
+ i += 4 * istep;
+ continue;
+ }
i += istep;
j = j + jstep1;
if (i >= j)
+ {
fft_reorder_swap_n4(io, i, j, N4, use_br2);
+ }
i += istep;
j = j - jstep2;
if (i >= j)
+ {
fft_reorder_swap_n4(io, i, j, N4, use_br2);
+ }
i += istep;
j = j + jstep1;
if (i >= j)
+ {
fft_reorder_swap_n4(io, i, j, N4, use_br2);
+ }
i += istep;
}
}
diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp
@@ -94,14 +94,19 @@ CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
#endif
-template <typename Stage, typename T, typename... Args>
-void add_stage(dft_plan<T>* self, Args... args)
+template <typename Stage, bool add_stages = true, typename T, typename... Args>
+void add_stage(dft_plan<T>* plan, Args... args)
{
dft_stage<T>* stage = new Stage(args...);
stage->need_reorder = true;
- self->data_size += stage->data_size;
- self->temp_size += stage->temp_size;
- self->stages.push_back(dft_stage_ptr<T>(stage));
+ plan->data_size += stage->data_size;
+ plan->temp_size += stage->temp_size;
+ plan->all_stages.push_back(dft_stage_ptr<T>(stage));
+ if constexpr (add_stages)
+ {
+ plan->stages[0].push_back(stage);
+ plan->stages[1].push_back(stage);
+ }
}
} // namespace CMT_ARCH_NAME
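
add_stage now separates ownership from scheduling: all_stages owns every stage (and is what dump() walks), while stages[0]/stages[1] hold the execution order for the direct and inverse transforms. With add_stages = false a caller can register a stage in only one direction, which is what lets the real-FFT pack/unpack stages differ between forward and inverse. A simplified sketch of the structure, assuming stand-in types:

    #include <array>
    #include <memory>
    #include <vector>

    struct stage { /* radix, buffers, execute()... */ };

    struct plan
    {
        std::vector<std::unique_ptr<stage>> all_stages;  // owns every stage
        std::array<std::vector<stage*>, 2> stages;       // [0]=direct, [1]=inverse
    };

    template <bool add_to_both = true>
    stage* add_stage(plan& p)
    {
        p.all_stages.push_back(std::make_unique<stage>());
        stage* s = p.all_stages.back().get();
        if constexpr (add_to_both)
        {
            p.stages[0].push_back(s);
            p.stages[1].push_back(s);
        }
        return s; // with add_to_both == false, the caller schedules s itself
    }
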
diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp
@@ -515,9 +515,11 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
}
});
+ int num_stages = 0;
if (cur_size >= 101)
{
add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size);
+ ++num_stages;
self->arblen = true;
}
else
@@ -535,6 +537,7 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
prepare_dft_stage(self, r, iterations, blocks, ctrue);
else
prepare_dft_stage(self, r, iterations, blocks, cfalse);
+ ++num_stages;
blocks *= r;
}
}
@@ -547,9 +550,10 @@ void init_dft(dft_plan<T>* self, size_t size, dft_order)
prepare_dft_stage(self, cur_size, iterations, blocks, ctrue);
else
prepare_dft_stage(self, cur_size, iterations, blocks, cfalse);
+ ++num_stages;
}
- if (self->stages.size() > 2)
+ if (num_stages > 2)
add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size);
}
}
diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -46,6 +46,57 @@ namespace kfr
inline namespace CMT_ARCH_NAME
{
+constexpr bool inline always_br2 = true;
+constexpr bool inline use_autosort = false;
+
+#define KFR_AUTOSORT_FOR_128D
+#define KFR_AUTOSORT_FOR_256D
+#define KFR_AUTOSORT_FOR_512
+#define KFR_AUTOSORT_FOR_1024
+
+#ifdef CMT_ARCH_AVX
+template <>
+KFR_INTRINSIC vec<float, 32> ctranspose<4, float, 32>(const vec<float, 32>& v16)
+{
+ cvec<float, 4> r0, r1, r2, r3;
+ split(v16, r0, r1, r2, r3);
+ const __m256d t0 = _mm256_unpacklo_pd(r0.v, r1.v);
+ const __m256d t1 = _mm256_unpacklo_pd(r2.v, r3.v);
+ const __m256d t2 = _mm256_unpackhi_pd(r0.v, r1.v);
+ const __m256d t3 = _mm256_unpackhi_pd(r2.v, r3.v);
+ r0.v = _mm256_permute2f128_pd(t0, t1, 0x20);
+ r1.v = _mm256_permute2f128_pd(t2, t3, 0x20);
+ r2.v = _mm256_permute2f128_pd(t0, t1, 0x31);
+ r3.v = _mm256_permute2f128_pd(t2, t3, 0x31);
+ return concat(r0, r1, r2, r3);
+}
+#endif
+
+template <>
+KFR_INTRINSIC vec<float, 64> ctranspose<8, float, 64>(const vec<float, 64>& v32)
+{
+ cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
+ split(v32, a0, a1, a2, a3, a4, a5, a6, a7);
+ cvec<float, 16> even = concat(a0, a2, a4, a6);
+ cvec<float, 16> odd = concat(a1, a3, a5, a7);
+ even = ctranspose<4>(even);
+ odd = ctranspose<4>(odd);
+ return concat(even, odd);
+}
+
+template <>
+KFR_INTRINSIC vec<float, 64> ctranspose<4, float, 64>(const vec<float, 64>& v32)
+{
+ cvec<float, 16> lo, hi;
+ split(v32, lo, hi);
+ lo = ctranspose<4>(lo);
+ hi = ctranspose<4>(hi);
+ cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
+ split(lo, a0, a1, a2, a3);
+ split(hi, a4, a5, a6, a7);
+ return concat(a0, a4, a1, a5, a2, a6, a3, a7);
+}
+
namespace intrinsics
{
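
ctranspose<cols> transposes a small matrix of complex values held in a single wide vector; the autosort passes use it to move between "one butterfly per lane" and linear memory order. The AVX specialization above does a 4x4 complex<float> transpose by reinterpreting each complex<float> as one 64-bit lane, so the pd unpack/permute2f128 shuffles move whole complex numbers. Its permutation semantics in plain scalar form, assuming std::complex as a stand-in:

    #include <complex>
    #include <cstdio>

    int main()
    {
        std::complex<float> m[16], t[16];
        for (int i = 0; i < 16; ++i) m[i] = { float(i), 0.f };
        // 4 rows of 4 complex values -> 4 columns of 4 complex values
        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 4; ++c)
                t[c * 4 + r] = m[r * 4 + c];
        for (int i = 0; i < 16; ++i)
            std::printf("%g ", t[i].real()); // 0 4 8 12 1 5 9 13 2 6 ...
    }

The 64-element variants reduce to this 4x4 case by splitting and recombining halves.
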
@@ -59,8 +110,13 @@ KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*spl
ww = swap<2>(ww);
if constexpr (inverse)
- tw_ = -(tw_);
- ww = subadd(b1, ww * dupodd(tw_));
+ {
+ ww = addsub(b1, ww * dupodd(tw_));
+ }
+ else
+ {
+ ww = subadd(b1, ww * dupodd(tw_));
+ }
return ww;
}
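
Instead of negating the twiddle's lanes for the inverse transform, the twiddle application now selects between subadd and addsub. With b1 = ww * dupeven(tw) already computed and ww swapped, subadd yields ww*tw (forward) while addsub yields ww*conj(tw) (inverse), removing one negation per butterfly. A scalar check, assuming subadd(x, y) = (x0 - y0, x1 + y1) and addsub(x, y) = (x0 + y0, x1 - y1) on (re, im) pairs:

    #include <cassert>
    #include <complex>

    int main()
    {
        const std::complex<double> a{ 2, 3 }, t{ 0.6, 0.8 };
        const double b1_re = a.real() * t.real(), b1_im = a.imag() * t.real();
        const double sw_re = a.imag() * t.imag(), sw_im = a.real() * t.imag();
        const std::complex<double> fwd{ b1_re - sw_re, b1_im + sw_im }; // subadd
        const std::complex<double> inv{ b1_re + sw_re, b1_im - sw_im }; // addsub
        assert(std::abs(fwd - a * t) < 1e-12);
        assert(std::abs(inv - a * std::conj(t)) < 1e-12);
    }
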
@@ -134,18 +190,18 @@ KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splito
{
const size_t N4 = N / 4;
cvec<T, width> w1, w2, w3;
- constexpr bool read_split = !splitin && splitout;
- constexpr bool write_split = splitin && !splitout;
+ constexpr bool read_split = !splitin;
+ constexpr bool write_split = !splitout;
vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3;
split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0);
- split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2);
- split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
-
const vec<T, width> sum02re = re0 + re2;
const vec<T, width> sum02im = im0 + im2;
+
+ split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
+ split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
const vec<T, width> sum13re = re1 + re3;
const vec<T, width> sum13im = im1 + im3;
@@ -254,19 +310,38 @@ CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, s
#endif
#endif
-template <typename T>
+template <size_t size = 1, typename T>
KFR_INTRINSIC void prefetch_one(const complex<T>* in)
{
KFR_PREFETCH(in);
+ if constexpr (sizeof(complex<T>) * size > 64)
+ KFR_PREFETCH(in + 64);
+ if constexpr (sizeof(complex<T>) * size > 128)
+ KFR_PREFETCH(in + 128);
+ if constexpr (sizeof(complex<T>) * size > 192)
+ KFR_PREFETCH(in + 192);
}
-template <typename T>
+template <size_t size = 1, typename T>
KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in)
{
- KFR_PREFETCH(in);
- KFR_PREFETCH(in + stride);
- KFR_PREFETCH(in + stride * 2);
- KFR_PREFETCH(in + stride * 3);
+ prefetch_one<size>(in);
+ prefetch_one<size>(in + stride);
+ prefetch_one<size>(in + stride * 2);
+ prefetch_one<size>(in + stride * 3);
+}
+
+template <size_t size = 1, typename T>
+KFR_INTRINSIC void prefetch_eight(size_t stride, const complex<T>* in)
+{
+ prefetch_one<size>(in);
+ prefetch_one<size>(in + stride);
+ prefetch_one<size>(in + stride * 2);
+ prefetch_one<size>(in + stride * 3);
+ prefetch_one<size>(in + stride * 4);
+ prefetch_one<size>(in + stride * 5);
+ prefetch_one<size>(in + stride * 6);
+ prefetch_one<size>(in + stride * 7);
}
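
The prefetch helpers now take the per-iteration element count, so a wide pass touches every cache line of its upcoming block, and passes express the lookahead in loop iterations (prefetch_cycles) rather than as a fixed element offset. A simplified sketch of the convention, assuming __builtin_prefetch in place of KFR_PREFETCH:

    #include <cstddef>

    template <typename T>
    void pass(T* out, const T* in, size_t n, size_t width)
    {
        constexpr size_t prefetch_cycles = 8; // stay 8 iterations ahead
        for (size_t i = 0; i < n; i += width)
        {
            __builtin_prefetch(in + i + prefetch_cycles * width);
            for (size_t j = 0; j < width; ++j)
                out[i + j] = in[i + j]; // stand-in for the butterfly body
        }
    }
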
template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
@@ -277,7 +352,7 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
const complex<T>*& twiddle)
{
static_assert(width > 0, "width cannot be zero");
- constexpr static size_t prefetch_offset = width * 8;
+ constexpr static size_t prefetch_cycles = 8;
const auto N4 = N / csize_t<4>();
const auto N43 = N4 * csize_t<3>();
CMT_ASSUME(blocks > 0);
@@ -286,16 +361,17 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
DFT_ASSERT(width <= N4);
CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++)
{
- CMT_PRAGMA_CLANG(clang loop unroll_count(2))
- for (size_t n2 = 0; n2 < N4; n2 += width)
+ CMT_LOOP_NOUNROLL
+ for (size_t n2 = 0; n2 < N4;)
{
if constexpr (prefetch)
- prefetch_four(N4, in + prefetch_offset);
+ prefetch_four<width>(N4, in + width * prefetch_cycles);
radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
in, twiddle + n2 * 3);
in += width;
out += width;
+ n2 += width;
}
in += N43;
out += N43;
@@ -304,41 +380,314 @@ KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool
return {};
}
-template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>,
- cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
- complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, bool inverse, typename T>
+KFR_INTRINSIC void radix2_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
+ cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>,
+ cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+ const complex<T>*& twiddle)
+
{
- CMT_ASSUME(blocks > 0);
- constexpr static size_t prefetch_offset = 32 * 4;
- for (size_t b = 0; b < blocks; b++)
+ for (size_t n = 0; n < stride; n++)
+ {
+ const cvec<T, 1> a = cread<1>(in + n);
+ const cvec<T, 1> b = cread<1>(in + n + stride);
+ cwrite<1>(out + n, a + b);
+ cwrite<1>(out + n + stride, a - b);
+ }
+}
+
+template <size_t N, bool inverse, bool split_format, typename T>
+KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
+ cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+{
+ if constexpr (split_format)
+ {
+ const cvec<T, N> sum02 = a0 + a2;
+ const cvec<T, N> diff02 = a0 - a2;
+ const cvec<T, N> sum13 = a1 + a3;
+ cvec<T, N> diff13 = a1 - a3;
+ vec<T, N> diff13re, diff13im, diff02re, diff02im;
+ split(diff02, diff02re, diff02im);
+ split(diff13, diff13re, diff13im);
+ w0 = sum02 + sum13;
+ w2 = sum02 - sum13;
+
+ (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+ (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+ }
+ else
+ {
+ const cvec<T, N> sum02 = a0 + a2;
+ const cvec<T, N> diff02 = a0 - a2;
+ const cvec<T, N> sum13 = a1 + a3;
+ cvec<T, N> diff13 = swap<2>(a1 - a3);
+ if constexpr (inverse)
+ diff13 = negodd(diff13);
+ else
+ diff13 = negeven(diff13);
+ w0 = sum02 + sum13;
+ w2 = sum02 - sum13;
+ w1 = diff02 - diff13;
+ w3 = diff02 + diff13;
+ }
+}
+
+template <size_t N, bool inverse, bool split_format, typename T>
+KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
+ cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3,
+ cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3)
+{
+ if constexpr (split_format)
+ {
+ const cvec<T, N> sum02 = a0 + a2;
+ const cvec<T, N> diff02 = a0 - a2;
+ const cvec<T, N> sum13 = a1 + a3;
+ cvec<T, N> diff13 = a1 - a3;
+ vec<T, N> diff13re, diff13im, diff02re, diff02im;
+ split(diff02, diff02re, diff02im);
+ split(diff13, diff13re, diff13im);
+
+ w0 = sum02 + sum13;
+ w2 = sum02 - sum13;
+ (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+ (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+
+ w2 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w2, tw2);
+ w1 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w1, tw1);
+ w3 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w3, tw3);
+ }
+ else
+ {
+ const cvec<T, N> sum02 = a0 + a2;
+ const cvec<T, N> diff02 = a0 - a2;
+ const cvec<T, N> sum13 = a1 + a3;
+ cvec<T, N> diff13 = swap<2>(a1 - a3);
+ if constexpr (inverse)
+ diff13 = negodd(diff13);
+ else
+ diff13 = negeven(diff13);
+
+ w0 = sum02 + sum13;
+ w2 = sum02 - sum13;
+ w1 = diff02 - diff13;
+ w3 = diff02 + diff13;
+ w2 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w2, tw2);
+ w1 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w1, tw1);
+ w3 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w3, tw3);
+ }
+}
+
+template <size_t N, bool split_format, typename T>
+KFR_INTRINSIC cvec<T, N> read_twiddle(const complex<T>* tw)
+{
+ if constexpr (split_format)
+ {
+ return concat(repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw))),
+ repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw) + 1)));
+ }
+ else
+ {
+ return repeat<N>(cread<1>(tw));
+ }
+}
+
+template <size_t width_, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass_first(size_t N, csize_t<width_>, cbool_t<splitout>, cbool_t<splitin>,
+ cbool_t<prefetch>, cbool_t<inverse>, complex<T>* out,
+ const complex<T>* in, const complex<T>*& twiddle)
+{
+ static_assert(width_ > 0, "width cannot be zero");
+ const size_t N4 = N / 4;
+ const size_t Nstride = N4;
+ constexpr size_t width = width_;
+ constexpr bool split = splitin || splitout;
+ static_assert(!split);
+ constexpr static size_t prefetch_cycles = 8;
+
+ // CMT_LOOP_NOUNROLL
+ for (size_t b = 0; b < N4; b += width)
{
if constexpr (prefetch)
- prefetch_four(csize_t<64>(), out + prefetch_offset);
- cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
- split(cread_split<8, aligned, splitin>(out + 0), w0, w1);
- split(cread_split<8, aligned, splitin>(out + 8), w2, w3);
- split(cread_split<8, aligned, splitin>(out + 16), w4, w5);
- split(cread_split<8, aligned, splitin>(out + 24), w6, w7);
-
- butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
-
- w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
- w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
- w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
- w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
- w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
- w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
- w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
-
- cvec<T, 8> z0, z1, z2, z3;
- transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
-
- butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
- cwrite<32, aligned>(out, bitreverse<2>(concat(concat(z0, z1), concat(z2, z3))));
- out += 32;
+ prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+ cvec<T, width> tw1 = cread<width>(twiddle);
+ cvec<T, width> tw2 = cread<width>(twiddle + width);
+ cvec<T, width> tw3 = cread<width>(twiddle + 2 * width);
+
+ const cvec<T, width> a0 = cread<width>(in + 0 * Nstride);
+ const cvec<T, width> a1 = cread<width>(in + 1 * Nstride);
+ const cvec<T, width> a2 = cread<width>(in + 2 * Nstride);
+ const cvec<T, width> a3 = cread<width>(in + 3 * Nstride);
+ cvec<T, width> w0, w1, w2, w3;
+ radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
+ cvec<T, 4 * width> w0123 = concat(w0, w1, w2, w3);
+ w0123 = ctranspose<width>(w0123);
+ cwrite<4 * width>(out, w0123);
+ twiddle += 3 * width;
+ in += width;
+ out += 4 * width;
+ }
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
+ cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+ complex<T>* out, const complex<T>* in,
+ const complex<T>*& twiddle)
+{
+ static_assert(width > 0, "width cannot be zero");
+ constexpr static size_t prefetch_cycles = 8;
+ constexpr bool split = splitin || splitout;
+ constexpr bool read_split = !splitin && split;
+ constexpr bool write_split = !splitout && split;
+ static_assert(!splitout);
+
+ CMT_PRAGMA_CLANG(clang loop unroll_count(4))
+ for (size_t n = 0; n < stride; n += width)
+ {
+ if constexpr (prefetch)
+ prefetch_four<width>(stride, in + prefetch_cycles * width);
+ const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
+ const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
+ const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
+ const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
+ cvec<T, width> w0, w1, w2, w3;
+ radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
+ cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+ cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+ cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+ cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+ in += width;
+ out += width;
}
- return {};
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix8_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
+ cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+ complex<T>* out, const complex<T>* in,
+ const complex<T>*& twiddle)
+{
+ static_assert(width > 0, "width cannot be zero");
+ constexpr static size_t prefetch_cycles = 4;
+ constexpr bool split = splitin || splitout;
+ constexpr bool read_split = !splitin && split;
+ constexpr bool write_split = !splitout && split;
+ static_assert(!splitout);
+
+ CMT_PRAGMA_CLANG(clang loop unroll_count(4))
+ for (size_t n = 0; n < stride; n += width)
+ {
+ if constexpr (prefetch)
+ prefetch_eight<width>(stride, in + prefetch_cycles * width);
+ const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
+ const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
+ const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
+ const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
+ const cvec<T, width> a4 = cread_split<width, false, read_split>(in + 4 * stride);
+ const cvec<T, width> a5 = cread_split<width, false, read_split>(in + 5 * stride);
+ const cvec<T, width> a6 = cread_split<width, false, read_split>(in + 6 * stride);
+ const cvec<T, width> a7 = cread_split<width, false, read_split>(in + 7 * stride);
+ cvec<T, width> w0, w1, w2, w3, w4, w5, w6, w7;
+ butterfly8<width, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, w0, w1, w2, w3, w4, w5, w6, w7);
+ cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+ cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+ cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+ cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+ cwrite_split<width, false, write_split>(out + 4 * stride, w4);
+ cwrite_split<width, false, write_split>(out + 5 * stride, w5);
+ cwrite_split<width, false, write_split>(out + 6 * stride, w6);
+ cwrite_split<width, false, write_split>(out + 7 * stride, w7);
+ in += width;
+ out += width;
+ }
+}
+
+template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
+KFR_INTRINSIC void radix4_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
+ cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
+ complex<T>* out, const complex<T>* in, const complex<T>*& twiddle)
+{
+ static_assert(width > 0, "width cannot be zero");
+ constexpr static size_t prefetch_cycles = 8;
+ const size_t N4 = N / 4;
+ const size_t Nstride = stride * N4;
+ const size_t stride3 = 3 * stride;
+ constexpr bool split = splitin || splitout;
+ constexpr bool read_split = !splitin && split;
+ constexpr bool write_split = !splitout && split;
+
+ {
+ for (size_t n = 0; n < stride; n += width)
+ {
+ if constexpr (prefetch)
+ prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+ const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
+ const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
+ const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
+ const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
+ cvec<T, width> w0, w1, w2, w3;
+ radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
+ cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+ cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+ cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+ cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+ in += width;
+ out += width;
+ }
+ twiddle += 3;
+ out += stride3;
+ }
+
+ // CMT_LOOP_NOUNROLL
+ for (size_t b = 1; b < N4; b++)
+ {
+ cvec<T, width> tw1 = read_twiddle<width, split>(twiddle);
+ cvec<T, width> tw2 = read_twiddle<width, split>(twiddle + 1);
+ cvec<T, width> tw3 = read_twiddle<width, split>(twiddle + 2);
+ for (size_t n = 0; n < stride; n += width)
+ {
+ if constexpr (prefetch)
+ prefetch_four<width>(Nstride, in + prefetch_cycles * width);
+ const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
+ const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
+ const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
+ const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
+ cvec<T, width> w0, w1, w2, w3;
+ radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
+ cwrite_split<width, false, write_split>(out + 0 * stride, w0);
+ cwrite_split<width, false, write_split>(out + 1 * stride, w1);
+ cwrite_split<width, false, write_split>(out + 2 * stride, w2);
+ cwrite_split<width, false, write_split>(out + 3 * stride, w3);
+ in += width;
+ out += width;
+ }
+ twiddle += 3;
+ out += stride3;
+ }
+}
+
+template <typename T>
+static void initialize_twiddle_autosort(size_t N, size_t w, complex<T>*& twiddle)
+{
+ for (size_t b = 0; b < N / 4; ++b)
+ {
+ cwrite<1>(twiddle + b / w * 3 * w + b % w + 0 * w, calculate_twiddle<T>(b, N));
+ cwrite<1>(twiddle + b / w * 3 * w + b % w + 1 * w, calculate_twiddle<T>(2 * b, N));
+ cwrite<1>(twiddle + b / w * 3 * w + b % w + 2 * w, calculate_twiddle<T>(3 * b, N));
+ }
+ twiddle += N / 4 * 3;
+}
+
+template <typename T>
+KFR_INTRINSIC void interleavehalves32(cvec<T, 32>& v0)
+{
+ cvec<T, 8> t0, t1, t2, t3;
+ split(v0, t0, t1, t2, t3);
+ t0 = interleavehalves(t0);
+ t1 = interleavehalves(t1);
+ t2 = interleavehalves(t2);
+ t3 = interleavehalves(t3);
+ v0 = concat(t0, t1, t2, t3);
}
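
initialize_twiddle_autosort above lays out twiddles in groups of w: for the butterflies b of one group it stores w copies of w^b, then w of w^2b, then w of w^3b, so a pass of SIMD width w loads each factor with a single contiguous cread<width>. A standalone layout sketch, assuming std::complex and the e^(-2*pi*i*k/N) sign convention for calculate_twiddle:

    #include <cmath>
    #include <complex>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const size_t N = 64, w = 4; // N/4 butterflies, in groups of w
        std::vector<std::complex<double>> tw(N / 4 * 3);
        for (size_t b = 0; b < N / 4; ++b)
            for (size_t k = 1; k <= 3; ++k)
                tw[b / w * 3 * w + b % w + (k - 1) * w] = std::polar(
                    1.0, -2.0 * 3.141592653589793 * double(k * b) / double(N));
        // group g holds [w x omega^b | w x omega^2b | w x omega^3b]
        std::printf("groups: %zu, twiddles per group: %zu\n", (N / 4) / w, 3 * w);
    }
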
template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
@@ -347,22 +696,47 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfa
complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
{
CMT_ASSUME(blocks > 0);
- DFT_ASSERT(2 <= blocks);
- constexpr static size_t prefetch_offset = width * 16;
- for (size_t b = 0; b < blocks; b += 2)
+ DFT_ASSERT(4 <= blocks);
+ constexpr static size_t prefetch_cycles = 8;
+ if constexpr (vector_capacity<T> >= 128)
{
- if constexpr (prefetch)
- prefetch_one(out + prefetch_offset);
-
- cvec<T, 8> vlo = cread<8, aligned>(out + 0);
- cvec<T, 8> vhi = cread<8, aligned>(out + 8);
- butterfly8<inverse>(vlo);
- butterfly8<inverse>(vhi);
- vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo);
- vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi);
- cwrite<8, aligned>(out, vlo);
- cwrite<8, aligned>(out + 8, vhi);
- out += 16;
+ CMT_PRAGMA_CLANG(clang loop unroll(disable))
+ for (size_t b = 0; b < blocks; b += 4)
+ {
+ if constexpr (prefetch)
+ prefetch_one<32>(out + prefetch_cycles * 32);
+
+ cvec<T, 32> v32 = cread<32, aligned>(out);
+ cvec<T, 4> v0, v1, v2, v3, v4, v5, v6, v7;
+ v32 = ctranspose<8>(v32);
+ split(v32, v0, v1, v2, v3, v4, v5, v6, v7);
+ butterfly8<4, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
+ v32 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
+ v32 = ctranspose<4>(v32);
+ cwrite<32, aligned>(out, v32);
+
+ out += 32;
+ }
+ }
+ else
+ {
+ CMT_PRAGMA_CLANG(clang loop unroll(disable))
+ for (size_t b = 0; b < blocks; b += 2)
+ {
+ if constexpr (prefetch)
+ prefetch_one<16>(out + prefetch_cycles * 16);
+
+ cvec<T, 16> v16 = cread<16, aligned>(out);
+ cvec<T, 2> v0, v1, v2, v3, v4, v5, v6, v7;
+ v16 = ctranspose<8>(v16);
+ split(v16, v0, v1, v2, v3, v4, v5, v6, v7);
+ butterfly8<2, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
+ v16 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
+ v16 = ctranspose<2>(v16);
+ cwrite<16, aligned>(out, v16);
+
+ out += 16;
+ }
}
return {};
}
@@ -373,28 +747,67 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cf
complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
{
CMT_ASSUME(blocks > 0);
- constexpr static size_t prefetch_offset = width * 4;
- DFT_ASSERT(2 <= blocks);
- CMT_PRAGMA_CLANG(clang loop unroll_count(2))
- for (size_t b = 0; b < blocks; b += 2)
+ constexpr static size_t prefetch_cycles = 4;
+ DFT_ASSERT(4 <= blocks);
+ if constexpr (vector_capacity<T> >= 128)
{
- if constexpr (prefetch)
- prefetch_one(out + prefetch_offset);
-
- cvec<T, 16> vlo = cread<16, aligned>(out);
- cvec<T, 16> vhi = cread<16, aligned>(out + 16);
- butterfly4<4, inverse>(vlo);
- butterfly4<4, inverse>(vhi);
- apply_twiddles4<0, 4, 4, inverse>(vlo);
- apply_twiddles4<0, 4, 4, inverse>(vhi);
- vlo = digitreverse4<2>(vlo);
- vhi = digitreverse4<2>(vhi);
- butterfly4<4, inverse>(vlo);
- butterfly4<4, inverse>(vhi);
-
- use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
- use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
- out += 32;
+ CMT_PRAGMA_CLANG(clang loop unroll(disable))
+ for (size_t b = 0; b < blocks; b += 4)
+ {
+ if constexpr (prefetch)
+ prefetch_one<64>(out + prefetch_cycles * 64);
+
+ cvec<T, 16> v0 = cread<16, aligned>(out);
+ cvec<T, 16> v1 = cread<16, aligned>(out + 16);
+ cvec<T, 16> v2 = cread<16, aligned>(out + 32);
+ cvec<T, 16> v3 = cread<16, aligned>(out + 48);
+ butterfly4_packed<4, inverse>(v0);
+ butterfly4_packed<4, inverse>(v1);
+ butterfly4_packed<4, inverse>(v2);
+ butterfly4_packed<4, inverse>(v3);
+ apply_twiddles4<0, 4, 4, inverse>(v0);
+ apply_twiddles4<0, 4, 4, inverse>(v1);
+ apply_twiddles4<0, 4, 4, inverse>(v2);
+ apply_twiddles4<0, 4, 4, inverse>(v3);
+ v0 = digitreverse4<2>(v0);
+ v1 = digitreverse4<2>(v1);
+ v2 = digitreverse4<2>(v2);
+ v3 = digitreverse4<2>(v3);
+ butterfly4_packed<4, inverse>(v0);
+ butterfly4_packed<4, inverse>(v1);
+ butterfly4_packed<4, inverse>(v2);
+ butterfly4_packed<4, inverse>(v3);
+
+ use_br2 ? cbitreverse_write(out, v0) : cdigitreverse4_write(out, v0);
+ use_br2 ? cbitreverse_write(out + 16, v1) : cdigitreverse4_write(out + 16, v1);
+ use_br2 ? cbitreverse_write(out + 32, v2) : cdigitreverse4_write(out + 32, v2);
+ use_br2 ? cbitreverse_write(out + 48, v3) : cdigitreverse4_write(out + 48, v3);
+ out += 64;
+ }
+ }
+ else
+ {
+ CMT_PRAGMA_CLANG(clang loop unroll(disable))
+ for (size_t b = 0; b < blocks; b += 2)
+ {
+ if constexpr (prefetch)
+ prefetch_one<32>(out + prefetch_cycles * 32);
+
+ cvec<T, 16> vlo = cread<16, aligned>(out);
+ cvec<T, 16> vhi = cread<16, aligned>(out + 16);
+ butterfly4_packed<4, inverse>(vlo);
+ butterfly4_packed<4, inverse>(vhi);
+ apply_twiddles4<0, 4, 4, inverse>(vlo);
+ apply_twiddles4<0, 4, 4, inverse>(vhi);
+ vlo = digitreverse4<2>(vlo);
+ vhi = digitreverse4<2>(vhi);
+ butterfly4_packed<4, inverse>(vlo);
+ butterfly4_packed<4, inverse>(vhi);
+
+ use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
+ use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
+ out += 32;
+ }
}
return {};
}
@@ -404,16 +817,18 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfa
cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
{
- constexpr static size_t prefetch_offset = width * 4;
+ constexpr static size_t prefetch_cycles = 8;
CMT_ASSUME(blocks > 8);
DFT_ASSERT(8 <= blocks);
for (size_t b = 0; b < blocks; b += 4)
{
if constexpr (prefetch)
- prefetch_one(out + prefetch_offset);
+ prefetch_one<16>(out + prefetch_cycles * 16);
cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out);
- butterfly4<4, inverse>(v16);
+ butterfly4_packed<4, inverse>(v16);
+ if constexpr (use_br2)
+ v16 = permutegroups<(8), 0, 2, 1, 3>(v16);
cdigitreverse4_write<aligned>(out, v16);
out += 4 * 4;
@@ -447,6 +862,7 @@ struct fft_stage_impl : dft_stage<T>
constexpr static bool prefetch = fft_config<T>::prefetch;
constexpr static bool aligned = false;
constexpr static size_t width = fft_config<T>::process_width;
+ constexpr static bool use_br2 = !is_even || always_br2;
virtual void do_initialize(size_t size) override final
{
@@ -464,7 +880,7 @@ struct fft_stage_impl : dft_stage<T>
const size_t stg_size = this->stage_size;
CMT_ASSUME(stg_size >= 2048);
CMT_ASSUME(stg_size % 2048 == 0);
- radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(),
+ radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
}
};
@@ -485,12 +901,18 @@ struct fft_final_stage_impl : dft_stage<T>
constexpr static size_t width = fft_config<T>::process_width;
constexpr static bool is_even = cometa::is_even(ilog2(size));
- constexpr static bool use_br2 = !is_even;
+ constexpr static bool use_br2 = !is_even || always_br2;
constexpr static bool aligned = false;
constexpr static bool prefetch = fft_config<T>::prefetch && splitin;
- KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
- KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
+ template <bool pass_splitin>
+ KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cbool_t<pass_splitin>, complex<T>*&)
+ {
+ }
+ template <bool pass_splitin>
+ KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cbool_t<pass_splitin>, complex<T>*&)
+ {
+ }
static constexpr bool get_pass_splitout(size_t N) { return N / 4 > 8 && N / 4 / 4 >= width; }
@@ -518,33 +940,25 @@ struct fft_final_stage_impl : dft_stage<T>
final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
}
- template <bool inverse, typename U = T, KFR_ENABLE_IF(std::is_same_v<U, float>)>
- KFR_MEM_INTRINSIC void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
- {
- radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-
- template <bool inverse, typename U = T, KFR_ENABLE_IF(std::is_same_v<U, float>)>
- KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
+ template <bool inverse, bool pass_splitin, typename U = T, KFR_ENABLE_IF(vector_capacity<U> >= 128)>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+ const complex<T>*, const complex<T>*& twiddle)
{
radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
}
- template <bool inverse>
- KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
+ template <bool inverse, bool pass_splitin>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+ const complex<T>*, const complex<T>*& twiddle)
{
radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
}
- template <bool inverse>
- KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
+ template <bool inverse, bool pass_splitin>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+ const complex<T>*, const complex<T>*& twiddle)
{
radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
@@ -583,7 +997,77 @@ struct fft_reorder_stage_impl : dft_stage<T>
template <bool inverse>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
{
- fft_reorder(out, this->user, cbool_t<!is_even>());
+ fft_reorder(out, this->user, cbool_t<(!is_even || always_br2)>());
+ }
+};
+
+template <typename T, bool is_first, bool is_last, bool radix8>
+struct fft_autosort_stage_impl : dft_stage<T>
+{
+ fft_autosort_stage_impl(size_t stage_size, size_t stride)
+ {
+ this->name = dft_name(this);
+ this->radix = radix8 ? 8 : 4;
+ this->stage_size = stage_size * stride * this->radix;
+ this->blocks = stage_size;
+ this->recursion = false;
+ this->can_inplace = is_last;
+ this->need_reorder = false;
+ this->user = stride;
+ if constexpr (!is_last)
+ {
+ this->data_size =
+ align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
+ }
+ }
+
+ constexpr static bool prefetch = fft_config<T>::prefetch;
+ constexpr static bool aligned = false;
+
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t total_size) final
+ {
+ if constexpr (!is_last)
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ if constexpr (is_first)
+ initialize_twiddle_autosort(this->blocks, width, twiddle);
+ else
+ initialize_twiddle_autosort(this->blocks, 1, twiddle);
+ }
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ const size_t stg_size = this->blocks;
+ const size_t stride = this->user;
+ if constexpr (is_first)
+ {
+ radix4_autosort_pass_first(stg_size, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+ cbool_t<inverse>(), out, in, twiddle);
+ }
+ else if constexpr (is_last)
+ {
+ if constexpr (radix8)
+ radix8_autosort_pass_last(stride, csize_t<width / 2>(), cfalse, cfalse, cbool_t<prefetch>(),
+ cbool_t<inverse>(), out, in, twiddle);
+ else
+ radix4_autosort_pass_last(stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+ cbool_t<inverse>(), out, in, twiddle);
+ }
+ else
+ {
+ if (stride == 4)
+ radix4_autosort_pass(stg_size, stride, csize_t<4>(), cfalse, cfalse, cbool_t<prefetch>(),
+ cbool_t<inverse>(), out, in, twiddle);
+ else
+ radix4_autosort_pass(stg_size, stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
+ cbool_t<inverse>(), out, in, twiddle);
+ }
}
};
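
For N = 4^k the autosort plan is one pass_first (stride 1), a chain of middle passes whose stride grows by 4 each time, and a radix-4 or radix-8 pass_last at stride N/4 (or N/8); the data ends up in natural order, which is why need_reorder stays false and no bit-reversal stage is added. A small arithmetic check of that chain, inferred from the fixed-size specializations below rather than generated by KFR itself:

    #include <cstddef>

    constexpr bool chain_covers(size_t N) // N must be a power of 4 here
    {
        size_t stage_size = N, stride = 1;
        while (stage_size > 4)
        {
            stage_size /= 4; // pass_first, then each middle pass
            stride *= 4;
        }
        return stride * 4 == N; // the radix-4 pass_last runs at stride N/4
    }
    static_assert(chain_covers(64) && chain_covers(1024) && chain_covers(4096));
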
@@ -593,7 +1077,11 @@ struct fft_specialization;
template <typename T>
struct fft_specialization<T, 0> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 1;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -608,7 +1096,11 @@ struct fft_specialization<T, 0> : dft_stage<T>
template <typename T>
struct fft_specialization<T, 1> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 2;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -625,7 +1117,11 @@ struct fft_specialization<T, 1> : dft_stage<T>
template <typename T>
struct fft_specialization<T, 2> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 4;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -642,7 +1138,11 @@ struct fft_specialization<T, 2> : dft_stage<T>
template <typename T>
struct fft_specialization<T, 3> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 8;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -650,7 +1150,7 @@ struct fft_specialization<T, 3> : dft_stage<T>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
cvec<T, 8> v8 = cread<8, aligned>(in);
- butterfly8<inverse>(v8);
+ butterfly8_packed<inverse>(v8);
cwrite<8, aligned>(out, v8);
}
};
@@ -658,7 +1158,11 @@ struct fft_specialization<T, 3> : dft_stage<T>
template <typename T>
struct fft_specialization<T, 4> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 16;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -666,7 +1170,7 @@ struct fft_specialization<T, 4> : dft_stage<T>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
cvec<T, 16> v16 = cread<16, aligned>(in);
- butterfly16<inverse>(v16);
+ butterfly16_packed<inverse>(v16);
cwrite<16, aligned>(out, v16);
}
};
@@ -674,7 +1178,11 @@ struct fft_specialization<T, 4> : dft_stage<T>
template <typename T>
struct fft_specialization<T, 5> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 32;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -682,25 +1190,100 @@ struct fft_specialization<T, 5> : dft_stage<T>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
cvec<T, 32> v32 = cread<32, aligned>(in);
- butterfly32<inverse>(v32);
+ butterfly32_packed<inverse>(v32);
cwrite<32, aligned>(out, v32);
}
};
+#ifdef KFR_AUTOSORT_FOR_64
+template <typename T>
+struct fft_specialization<T, 6> : dft_stage<T>
+{
+ fft_specialization(size_t stage_size)
+ {
+ this->stage_size = 64;
+ this->name = dft_name(this);
+ this->temp_size = 64 * sizeof(complex<T>);
+ this->data_size = 64 * sizeof(complex<T>);
+ }
+
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t) final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddle_autosort(64, width, twiddle);
+ initialize_twiddle_autosort(16, 1, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ auto no = cfalse;
+ const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+ complex<T>* scratch = ptr_cast<complex<T>>(temp);
+ radix4_autosort_pass_first(64, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+ radix4_autosort_pass(16, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+ }
+};
+#else
template <typename T>
struct fft_specialization<T, 6> : dft_stage<T>
{
- fft_specialization(size_t) { this->name = dft_name(this); }
+ fft_specialization(size_t)
+ {
+ this->stage_size = 64;
+ this->name = dft_name(this);
+ }
constexpr static bool aligned = false;
DFT_STAGE_FN
template <bool inverse>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
- butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
+ butterfly64_memory(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
}
};
+#endif
+#ifdef KFR_AUTOSORT_FOR_128D
+template <>
+struct fft_specialization<double, 7> : dft_stage<double>
+{
+ using T = double;
+ fft_specialization(size_t stage_size)
+ {
+ this->stage_size = 128;
+ this->name = dft_name(this);
+ this->temp_size = 128 * sizeof(complex<T>);
+ this->data_size = 128 * sizeof(complex<T>);
+ }
+
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t) final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddle_autosort(128, width, twiddle);
+ initialize_twiddle_autosort(32, 1, twiddle);
+ initialize_twiddle_autosort(8, 1, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ auto no = cfalse;
+ const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+ complex<T>* scratch = ptr_cast<complex<T>>(temp);
+ radix4_autosort_pass_first(128, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+ radix4_autosort_pass(32, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix8_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+ }
+};
+#else
template <>
struct fft_specialization<double, 7> : dft_stage<double>
{
@@ -741,6 +1324,7 @@ struct fft_specialization<double, 7> : dft_stage<double>
fft_reorder(out, csize_t<7>());
}
};
+#endif
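
The twiddle budget of the 128-point chain fits the data_size reserved above; per initialize_twiddle_autosort, each call advances the pointer by N/4*3 complex values. Plain arithmetic, nothing assumed beyond the calls in do_initialize:

    static_assert(128 / 4 * 3 == 96); // initialize_twiddle_autosort(128, width)
    static_assert(32 / 4 * 3 == 24);  // initialize_twiddle_autosort(32, 1)
    static_assert(8 / 4 * 3 == 6);    // initialize_twiddle_autosort(8, 1)
    static_assert(96 + 24 + 6 <= 128); // 126 <= the 128 complex values reserved
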
template <>
struct fft_specialization<float, 7> : dft_stage<float>
@@ -775,7 +1359,9 @@ struct fft_specialization<float, 7> : dft_stage<float>
const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
- radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ radix4_pass(32, 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+ cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
if (this->need_reorder)
fft_reorder(out, csize_t<7>());
@@ -787,8 +1373,9 @@ struct fft_specialization<float, 8> : dft_stage<float>
{
fft_specialization(size_t)
{
- this->name = dft_name(this);
- this->temp_size = sizeof(complex<float>) * 256;
+ this->stage_size = 256;
+ this->name = dft_name(this);
+ this->temp_size = sizeof(complex<float>) * 256;
}
using T = float;
@@ -824,13 +1411,62 @@ struct fft_specialization<float, 8> : dft_stage<float>
}
};
+#ifdef KFR_AUTOSORT_FOR_256D
+
+template <>
+struct fft_specialization<double, 8> : dft_stage<double>
+{
+ using T = double;
+ fft_specialization(size_t stage_size)
+ {
+ this->stage_size = 256;
+ this->name = dft_name(this);
+ this->temp_size = 256 * sizeof(complex<T>);
+ this->data_size = 256 * sizeof(complex<T>);
+ }
+
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t) final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddle_autosort(256, width, twiddle);
+ initialize_twiddle_autosort(64, 1, twiddle);
+ initialize_twiddle_autosort(16, 1, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ auto no = cfalse;
+ const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+ complex<T>* scratch = ptr_cast<complex<T>>(temp);
+ if (in != out)
+ {
+ radix4_autosort_pass_first(256, csize<width>, no, no, no, cbool<inverse>, out, in, tw);
+ radix4_autosort_pass(64, 4, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw);
+ radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+ }
+ else
+ {
+ radix4_autosort_pass_first(256, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+ radix4_autosort_pass(64, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw);
+ radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
+ }
+ }
+};
+#else
template <>
struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
{
using T = double;
fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
{
- this->name = dft_name(this);
+ this->stage_size = 256;
+ this->name = dft_name(this);
}
DFT_STAGE_FN
@@ -839,16 +1475,56 @@ struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
{
fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
if (this->need_reorder)
- fft_reorder(out, csize_t<8>());
+ fft_reorder(out, csize_t<8>(), cbool<always_br2>);
}
};
+#endif
+
+#ifdef KFR_AUTOSORT_FOR_512
+
+template <typename T>
+struct fft_specialization<T, 9> : dft_stage<T>
+{
+ fft_specialization(size_t stage_size)
+ {
+ this->stage_size = 512;
+ this->name = dft_name(this);
+ this->temp_size = 512 * sizeof(complex<T>);
+ this->data_size = 512 * sizeof(complex<T>);
+ }
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t) final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddle_autosort(512, width, twiddle);
+ initialize_twiddle_autosort(128, 1, twiddle);
+ initialize_twiddle_autosort(32, 1, twiddle);
+ initialize_twiddle_autosort(8, 1, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ auto no = cfalse;
+ const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+ complex<T>* scratch = ptr_cast<complex<T>>(temp);
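+ // 512 = 4*4*4*8: three radix-4 passes alternate between scratch and out,
+ // then a radix-8 pass finishes the transform and writes the result to `out`.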
+ radix4_autosort_pass_first(512, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+ radix4_autosort_pass(128, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass(32, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw);
+ radix8_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
+ }
+};
+#else
template <typename T>
struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
{
fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
{
- this->name = dft_name(this);
+ this->stage_size = 512;
+ this->name = dft_name(this);
}
DFT_STAGE_FN
@@ -860,7 +1536,47 @@ struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
fft_reorder(out, csize_t<9>());
}
};
+#endif
+#ifdef KFR_AUTOSORT_FOR_1024
+template <typename T>
+struct fft_specialization<T, 10> : dft_stage<T>
+{
+ fft_specialization(size_t stage_size)
+ {
+ this->stage_size = 1024;
+ this->name = dft_name(this);
+ this->temp_size = 1024 * sizeof(complex<T>);
+ this->data_size = 1024 * sizeof(complex<T>);
+ }
+
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t) final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddle_autosort(1024, width, twiddle);
+ initialize_twiddle_autosort(256, 1, twiddle);
+ initialize_twiddle_autosort(64, 1, twiddle);
+ initialize_twiddle_autosort(16, 1, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ auto no = cfalse;
+ auto split = cfalse;
+ const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+ complex<T>* scratch = ptr_cast<complex<T>>(temp);
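+ // 1024 = 4^5: five radix-4 passes. `split` is wired through to the middle
+ // passes but left disabled (cfalse) here; presumably the split-complex
+ // intermediate format did not pay off for this size.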
+ radix4_autosort_pass_first(1024, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+ radix4_autosort_pass(256, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass(64, 16, csize<width>, split, no, no, cbool<inverse>, scratch, out, tw);
+ radix4_autosort_pass(16, 64, csize<width>, split, split, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass_last(256, csize<width>, no, split, no, cbool<inverse>, out, out, tw);
+ }
+};
+#else
template <typename T>
struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
{
@@ -878,23 +1594,86 @@ struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
fft_reorder(out, 10, cfalse);
}
};
+#endif
+
+#ifdef KFR_AUTOSORT_FOR_2048
+template <typename T>
+struct fft_specialization<T, 11> : dft_stage<T>
+{
+ fft_specialization(size_t stage_size)
+ {
+ this->stage_size = 2048;
+ this->name = dft_name(this);
+ this->temp_size = 2048 * sizeof(complex<T>);
+ this->data_size = 2048 * sizeof(complex<T>);
+ }
+
+ constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
+
+ void do_initialize(size_t) final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddle_autosort(2048, width, twiddle);
+ initialize_twiddle_autosort(512, 1, twiddle);
+ initialize_twiddle_autosort(128, 1, twiddle);
+ initialize_twiddle_autosort(32, 1, twiddle);
+ initialize_twiddle_autosort(8, 1, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ auto no = cfalse;
+ const complex<T>* tw = ptr_cast<complex<T>>(this->data);
+ complex<T>* scratch = ptr_cast<complex<T>>(temp);
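+ // 2048 = 4^4 * 8: four radix-4 passes ping-pong between scratch and out,
+ // then a radix-8 pass completes the transform in place in `out`.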
+ radix4_autosort_pass_first(2048, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
+ radix4_autosort_pass(512, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix4_autosort_pass(128, 16, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw);
+ radix4_autosort_pass(32, 64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
+ radix8_autosort_pass_last(256, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
+ }
+};
+#endif
} // namespace intrinsics
template <bool is_even, bool first, typename T>
void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>, cbool_t<first>)
{
- constexpr size_t final_size = is_even ? 1024 : 512;
-
- if (stage_size >= 2048)
+ if constexpr (use_autosort)
{
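+ // Autosort path: peel off radix-4 stages while the remaining size is at
+ // least 16, then emit a single last stage (radix 8 when the leftover is 8,
+ // smaller radix otherwise). Stockham-style autosort passes keep the data
+ // in natural order, so no separate reorder stage is added on this path.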
- add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size);
-
- make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse);
+ if (stage_size >= 16)
+ {
+ add_stage<intrinsics::fft_autosort_stage_impl<T, first, false, false>>(self, stage_size,
+ self->size / stage_size);
+ make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse);
+ }
+ else
+ {
+ if (stage_size == 8)
+ add_stage<intrinsics::fft_autosort_stage_impl<T, false, true, true>>(self, stage_size,
+ self->size / stage_size);
+ else
+ add_stage<intrinsics::fft_autosort_stage_impl<T, false, true, false>>(
+ self, stage_size, self->size / stage_size);
+ }
}
else
{
- add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size);
+ constexpr size_t final_size = is_even ? 1024 : 512;
+
+ if (stage_size >= 2048)
+ {
+ add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(self, stage_size);
+
+ make_fft(self, stage_size / 4, cbool_t<is_even>(), cfalse);
+ }
+ else
+ {
+ add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(self, final_size);
+ add_stage<intrinsics::fft_reorder_stage_impl<T, is_even>>(self, self->size);
+ }
}
}
@@ -935,7 +1714,7 @@ KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self)
{
self->data = autofree<u8>(self->data_size);
size_t offset = 0;
- for (dft_stage_ptr<T>& stage : self->stages)
+ for (dft_stage_ptr<T>& stage : self->all_stages)
{
initialize_data_stage(self, stage, offset);
}
@@ -945,21 +1724,10 @@ KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self)
template <typename T>
KFR_INTRINSIC void initialize_order(dft_plan<T>* self)
{
- bool to_scratch = false;
- bool scratch_needed = false;
- for (dft_stage_ptr<T>& stage : reversed(self->stages))
- {
- if (to_scratch)
- {
- scratch_needed = true;
- }
- stage->to_scratch = to_scratch;
- if (!stage->can_inplace)
- {
- to_scratch = !to_scratch;
- }
- }
- if (scratch_needed || !self->stages[0]->can_inplace)
+ self->calc_disposition();
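+ // OR the per-stage dispositions for both directions and both placements;
+ // any set bit means some stage reads or writes the scratch buffer, so one
+ // scratch block of `size` complex elements is reserved up front.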
+ typename dft_plan<T>::bitset ored = self->disposition_inplace[0] | self->disposition_inplace[1] |
+ self->disposition_outofplace[0] | self->disposition_outofplace[1];
+ if (ored.any()) // scratch is needed by at least one stage
self->temp_size += align_up(sizeof(complex<T>) * self->size, platform<>::native_cache_alignment);
}
@@ -975,15 +1743,8 @@ KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order)
constexpr size_t log2nv = val_of(decltype(log2n)());
add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size);
},
- [&]()
- {
- cswitch(cfalse_true, is_even(log2n),
- [&](auto is_even)
- {
- make_fft(self, size, is_even, ctrue);
- constexpr size_t is_evenv = val_of(decltype(is_even)());
- add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(self, size);
- });
+ [&]() {
+ cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { make_fft(self, size, is_even, ctrue); });
});
}
@@ -1099,12 +1860,12 @@ void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const com
const cvec<T, width> fpk = cread<width>(in + i);
const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
- const cvec<T, width> f1k = fpk + fpnk;
- const cvec<T, width> f2k = fpk - fpnk;
- const cvec<T, width> t = cmul_conj(f2k, tw);
- cwrite<width>(out + i, f1k + t);
- cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t)));
- });
+ const cvec<T, width> f1k = fpk + fpnk;
+ const cvec<T, width> f2k = fpk - fpnk;
+ const cvec<T, width> t = cmul_conj(f2k, tw);
+ cwrite<width>(out + i, f1k + t);
+ cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t)));
+ });
if (is_even(csize))
{
size_t k = csize / 2;
@@ -1151,10 +1912,11 @@ struct dft_stage_real_repack : dft_stage<T>
public:
dft_stage_real_repack(size_t real_size, dft_pack_format fmt)
{
- this->user = static_cast<int>(fmt);
- this->stage_size = real_size;
+ this->user = static_cast<int>(fmt);
+ this->stage_size = real_size;
+ this->can_inplace = true;
const size_t count = (real_size / 2 + 1) / 2;
- this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment);
+ this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment);
}
void do_initialize(size_t) override
{
@@ -1162,15 +1924,15 @@ public:
constexpr size_t width = vector_width<T> * 2;
size_t real_size = this->stage_size;
complex<T>* rtwiddle = ptr_cast<complex<T>>(this->data);
- const size_t count = (real_size / 2 + 1) / 2;
+ const size_t count = (real_size / 2 + 1) / 2;
block_process(count, csizes_t<width, 1>(),
[=](size_t i, auto w)
{
constexpr size_t width = val_of(decltype(w)());
cwrite<width>(
rtwiddle + i,
- cossin(dup(-constants<T>::pi *
- ((enumerate<T, width>() + i + real_size / T(4)) / (real_size / 2)))));
+ cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + real_size / T(4)) /
+ (real_size / 2)))));
});
}
void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override
@@ -1191,10 +1953,10 @@ void dft_real_initialize(dft_plan_real<T>& plan)
if (plan.size == 0)
return;
initialize_stages(&plan);
- plan.fmt_stage.reset(new dft_stage_real_repack<T>(plan.size, plan.fmt));
- plan.data_size += plan.fmt_stage->data_size;
- size_t offset = initialize_data(&plan);
- initialize_data_stage(&plan, plan.fmt_stage, offset);
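+ // The real-format repack stage now lives in the shared stage lists instead
+ // of a dedicated fmt_stage member: it runs last in the forward direction
+ // (stages[0]) and first in the inverse direction (stages[1]).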
+ add_stage<dft_stage_real_repack<T>, false>(&plan, plan.size, plan.fmt);
+ plan.stages[0].push_back(plan.all_stages.back().get());
+ plan.stages[1].insert(plan.stages[1].begin(), plan.all_stages.back().get());
+ initialize_data(&plan);
initialize_order(&plan);
}
diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp
@@ -37,7 +37,6 @@
#include "../../base/memory.hpp"
#include "../data/sincos.hpp"
-
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wpass-failed")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed")
@@ -181,7 +180,7 @@ template <size_t N, bool A = false, bool split = false, typename T>
KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src)
{
cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>());
- if (split)
+ if constexpr (split)
temp = splitpairs(temp);
return temp;
}
@@ -190,7 +189,7 @@ template <size_t N, bool A = false, bool split = false, typename T>
KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value)
{
cvec<T, N> v = value;
- if (split)
+ if constexpr (split)
v = interleavehalves(v);
v.write(ptr_cast<T>(dest), cbool_t<A>());
}
@@ -278,12 +277,14 @@ KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices
template <size_t N, size_t stride, typename T>
KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base)
{
- if (stride == 1)
+ if constexpr (stride == 1)
{
return ref_cast<cvec<T, N>>(*base);
}
else
+ {
return cgather_helper<N, stride, T>(base, csizeseq_t<N>());
+ }
}
KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t)
@@ -342,7 +343,7 @@ KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, cs
template <size_t N, size_t stride, typename T>
KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value)
{
- if (stride == 1)
+ if constexpr (stride == 1)
{
cwrite<N>(base, value);
}
@@ -510,35 +511,35 @@ template <size_t k, size_t size, bool inverse = false, typename T, size_t width,
KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
{
constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485);
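+ // kk indexes the twiddle angle as a fraction of the full circle; the eight
+ // exact multiples of pi/4 below reduce the complex multiply to negations,
+ // swaps and at most one scale by sqrt(2)/2, avoiding a full cmul.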
- if (kk == 0)
+ if constexpr (kk == 0)
{
return x;
}
- else if (kk == size * 1 / 8)
+ else if constexpr (kk == size * 1 / 8)
{
return swap<2>(subadd(swap<2>(x), x)) * isqrt2;
}
- else if (kk == size * 2 / 8)
+ else if constexpr (kk == size * 2 / 8)
{
return negodd(swap<2>(x));
}
- else if (kk == size * 3 / 8)
+ else if constexpr (kk == size * 3 / 8)
{
return subadd(x, swap<2>(x)) * -isqrt2;
}
- else if (kk == size * 4 / 8)
+ else if constexpr (kk == size * 4 / 8)
{
return -x;
}
- else if (kk == size * 5 / 8)
+ else if constexpr (kk == size * 5 / 8)
{
return swap<2>(subadd(swap<2>(x), x)) * -isqrt2;
}
- else if (kk == size * 6 / 8)
+ else if constexpr (kk == size * 6 / 8)
{
return swap<2>(negodd(x));
}
- else if (kk == size * 7 / 8)
+ else if constexpr (kk == size * 7 / 8)
{
return subadd(x, swap<2>(x)) * isqrt2;
}
@@ -582,7 +583,7 @@ KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, c
diff13 = high(diff0213);
w0 = sum02 + sum13;
w2 = sum02 - sum13;
- if (inverse)
+ if constexpr (inverse)
{
diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T()));
diff13 = swap<2>(diff13);
@@ -679,7 +680,7 @@ KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45,
}
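+// The `_packed` suffix distinguishes these overloads, which operate on all
+// butterfly inputs packed into a single cvec, from the pointer-based
+// overloads of the same radix (see butterfly64_memory below).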
template <bool inverse = false, typename T>
-KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8)
+KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8)
{
cvec<T, 2> w0, w1, w2, w3;
split<T, 16>(v8, w0, w1, w2, w3);
@@ -688,7 +689,7 @@ KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8)
}
template <bool inverse = false, typename T>
-KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32)
+KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32)
{
cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
split(v32, w0, w1, w2, w3, w4, w5, w6, w7);
@@ -710,7 +711,7 @@ KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32)
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123)
+KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123)
{
cvec<T, N> a0;
cvec<T, N> a1;
@@ -722,7 +723,7 @@ KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123)
}
template <size_t N, typename T>
-KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01)
+KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01)
{
cvec<T, N> a0;
cvec<T, N> a1;
@@ -734,14 +735,14 @@ KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01)
template <size_t N, bool inverse = false, bool split_format = false, typename T>
KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1)
{
- if (split_format)
+ if constexpr (split_format)
{
vec<T, N> re1, im1, tw1re, tw1im;
split<T, 2 * N>(a1, re1, im1);
split<T, 2 * N>(tw1, tw1re, tw1im);
vec<T, N> b1re = re1 * tw1re;
vec<T, N> b1im = im1 * tw1re;
- if (inverse)
+ if constexpr (inverse)
w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im);
else
w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im);
@@ -752,7 +753,7 @@ KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cv
const cvec<T, N> a1_ = swap<2>(a1);
cvec<T, N> tw1_ = tw1;
- if (inverse)
+ if constexpr (inverse)
tw1_ = -(tw1_);
w1 = subadd(b1, a1_ * dupodd(tw1_));
}
@@ -839,37 +840,38 @@ KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
}
template <bool inverse, bool aligned, typename T>
-KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in)
+KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
+ const complex<T>* in)
{
cvec<T, 16> w0, w1, w2, w3;
w0 = cread_group<4, 4, 16, aligned>(
in); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48));
- butterfly4<4, inverse>(w0);
+ butterfly4_packed<4, inverse>(w0);
apply_twiddles4<0, 1, 4, inverse>(w0);
w1 = cread_group<4, 4, 16, aligned>(
in + 4); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52));
- butterfly4<4, inverse>(w1);
+ butterfly4_packed<4, inverse>(w1);
apply_twiddles4<4, 1, 4, inverse>(w1);
w2 = cread_group<4, 4, 16, aligned>(
in + 8); // concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56));
- butterfly4<4, inverse>(w2);
+ butterfly4_packed<4, inverse>(w2);
apply_twiddles4<8, 1, 4, inverse>(w2);
w3 = cread_group<4, 4, 16, aligned>(
in + 12); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60));
- butterfly4<4, inverse>(w3);
+ butterfly4_packed<4, inverse>(w3);
apply_twiddles4<12, 1, 4, inverse>(w3);
transpose4(w0, w1, w2, w3);
// pass 2:
- butterfly4<4, inverse>(w0);
- butterfly4<4, inverse>(w1);
- butterfly4<4, inverse>(w2);
- butterfly4<4, inverse>(w3);
+ butterfly4_packed<4, inverse>(w0);
+ butterfly4_packed<4, inverse>(w1);
+ butterfly4_packed<4, inverse>(w2);
+ butterfly4_packed<4, inverse>(w3);
transpose4(w0, w1, w2, w3);
@@ -881,26 +883,26 @@ KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* o
apply_vertical_twiddles4<4, inverse>(w1, w2, w3);
// pass 3:
- butterfly4<4, inverse>(w3);
+ butterfly4_packed<4, inverse>(w3);
cwrite_group<4, 4, 16, aligned>(out + 12, w3); // split(w3, out[3], out[7], out[11], out[15]);
- butterfly4<4, inverse>(w2);
+ butterfly4_packed<4, inverse>(w2);
cwrite_group<4, 4, 16, aligned>(out + 8, w2); // split(w2, out[2], out[6], out[10], out[14]);
- butterfly4<4, inverse>(w1);
+ butterfly4_packed<4, inverse>(w1);
cwrite_group<4, 4, 16, aligned>(out + 4, w1); // split(w1, out[1], out[5], out[9], out[13]);
- butterfly4<4, inverse>(w0);
+ butterfly4_packed<4, inverse>(w0);
cwrite_group<4, 4, 16, aligned>(out, w0); // split(w0, out[0], out[4], out[8], out[12]);
}
template <bool inverse = false, typename T>
-KFR_INTRINSIC void butterfly16(cvec<T, 16>& v16)
+KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16)
{
- butterfly4<4, inverse>(v16);
+ butterfly4_packed<4, inverse>(v16);
apply_twiddles4<0, 4, 4, inverse>(v16);
v16 = digitreverse4<2>(v16);
- butterfly4<4, inverse>(v16);
+ butterfly4_packed<4, inverse>(v16);
}
template <size_t index, bool inverse = false, typename T>
@@ -1505,7 +1507,7 @@ template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N
KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
{
vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr));
- if (transposed)
+ if constexpr (transposed)
temp = ctranspose<sizeof...(N)>(temp);
split(temp, w...);
}
@@ -1533,7 +1535,7 @@ template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N
KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args)
{
auto temp = concat(args...);
- if (transposed)
+ if constexpr (transposed)
temp = ctransposeinverse<sizeof...(N)>(temp);
write(ptr_cast<T>(ptr), temp);
}
@@ -1634,7 +1636,7 @@ KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t
const cvec<T, 1> ina = cread<1>(in + (1 + j));
const cvec<T, 1> inb = cread<1>(in + radix - (j + 1));
cvec<T, width> tw = cread<width>(twiddle);
- if (inverse)
+ if constexpr (inverse)
tw = negodd /*cconj*/ (tw);
cmul_2conj(sum0, sum1, ina, inb, tw);
@@ -1642,9 +1644,6 @@ KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t
}
twiddle = twiddle - halfradix_sqr + width;
- // if (inverse)
- // std::swap(sum0, sum1);
-
if (is_constant_val(ostride))
{
cwrite<width>(out + (1 + i), sum0);
diff --git a/include/kfr/simd/digitreverse.hpp b/include/kfr/simd/digitreverse.hpp
@@ -39,29 +39,37 @@ CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative")
-constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; }
+constexpr KFR_INTRINSIC u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; }
template <u32 m, u32 shift, u32... values>
-constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>)
+constexpr KFR_INTRINSIC u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>)
{
return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>());
}
template <size_t bits>
-constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>)
+constexpr KFR_INTRINSIC u32 digitreverse_impl(u32 x, csize_t<2>)
{
- return bit_permute_step_impl(
- x,
- cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >>
- (32 - bits);
+ constexpr cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>
+ steps{};
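+ // For narrow indices only a prefix of the mask/shift pairs contributes, so
+ // steps[csizeseq<N>] truncates the list: a full 32-bit reversal above
+ // 16 bits, a 16-bit reversal above 8 bits, and an 8-bit reversal below.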
+ if constexpr (bits > 16)
+ return bit_permute_step_impl(x, steps) >> (32 - bits);
+ else if constexpr (bits > 8)
+ return bit_permute_step_impl(x, steps[csizeseq<8>]) >> (16 - bits);
+ else
+ return bit_permute_step_impl(x, steps[csizeseq<6>]) >> (8 - bits);
}
template <size_t bits>
-constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>)
+constexpr KFR_INTRINSIC u32 digitreverse_impl(u32 x, csize_t<4>)
{
- return bit_permute_step_impl(
- x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >>
- (32 - bits);
+ constexpr cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16> steps{};
+ if constexpr (bits > 16)
+ return bit_permute_step_impl(x, steps) >> (32 - bits);
+ else if constexpr (bits > 8)
+ return bit_permute_step_impl(x, steps[csizeseq<6>]) >> (16 - bits);
+ else
+ return bit_permute_step_impl(x, steps[csizeseq<4>]) >> (8 - bits);
}
CMT_PRAGMA_GNU(GCC diagnostic pop)
@@ -69,7 +77,7 @@ CMT_PRAGMA_GNU(GCC diagnostic pop)
template <size_t radix, size_t bits>
struct shuffle_index_digitreverse
{
- constexpr inline size_t operator()(size_t index) const CMT_NOEXCEPT
+ constexpr KFR_INTRINSIC size_t operator()(size_t index) const CMT_NOEXCEPT
{
return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>());
}
@@ -96,13 +104,13 @@ KFR_INTRINSIC vec<T, N> digitreverse4(const vec<T, N>& x)
}
template <size_t bits>
-constexpr inline u32 bitreverse(u32 x)
+constexpr KFR_INTRINSIC u32 bitreverse(u32 x)
{
return internal::digitreverse_impl<bits>(x, csize_t<2>());
}
template <size_t bits>
-constexpr inline u32 digitreverse4(u32 x)
+constexpr KFR_INTRINSIC u32 digitreverse4(u32 x)
{
return internal::digitreverse_impl<bits>(x, csize_t<4>());
}
diff --git a/include/kfr/simd/impl/specializations.hpp b/include/kfr/simd/impl/specializations.hpp
@@ -22,22 +22,20 @@
#include "../shuffle.hpp"
#endif
-#ifdef KFR_COMPILER_GNU
+#ifdef CMT_COMPILER_GNU
namespace kfr
{
inline namespace CMT_ARCH_NAME
{
-namespace intrinsics
-{
template <>
-inline vec<f32, 32> shufflevector<f32, 32>(
+template <>
+inline vec<f32, 32> vec<f32, 32>::shuffle(
csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14,
- 15, 22, 23, 30, 31>,
- const vec<f32, 32>& x, const vec<f32, 32>&)
+ 15, 22, 23, 30, 31>) const CMT_NOEXCEPT
{
- f32x32 w = x;
+ f32x32 w = *this;
w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)),
permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w)));
@@ -47,12 +45,12 @@ inline vec<f32, 32> shufflevector<f32, 32>(
}
template <>
-inline vec<f32, 32> shufflevector<f32, 32>(
+template <>
+inline vec<f32, 32> vec<f32, 32>::shuffle(
csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22,
- 23, 14, 15, 30, 31>,
- const vec<f32, 32>& x, const vec<f32, 32>&)
+ 23, 14, 15, 30, 31>) const CMT_NOEXCEPT
{
- f32x32 w = x;
+ f32x32 w = *this;
w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)),
permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w)));
@@ -63,54 +61,55 @@ inline vec<f32, 32> shufflevector<f32, 32>(
inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x)
{
- return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18,
- 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>,
- x, x);
+ return x.shuffle(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11,
+ 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>);
}
template <>
-inline vec<f32, 64> shufflevector<f32, 64>(
+template <>
+inline vec<f32, 64> vec<f32, 64>::shuffle(
csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13,
44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38,
- 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>,
- const vec<f32, 64>& x, const vec<f32, 64>&)
+ 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>) const CMT_NOEXCEPT
{
return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(
- concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x))));
+ concat(bitreverse_2(even<8>(*this)), bitreverse_2(odd<8>(*this))));
}
template <>
-inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>,
- const vec<f32, 16>& x, const vec<f32, 16>&)
+template <>
+inline vec<f32, 16> vec<f32, 16>::shuffle(
+ csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>) const CMT_NOEXCEPT
{
- // asm volatile("int $3");
- const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x);
+ const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(*this);
- return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)),
- shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx)));
+ return concat(low(xx).shuffle(high(xx), csizes<0, 2, 8 + 0, 8 + 2, 4, 6, 8 + 4, 8 + 6>),
+ low(xx).shuffle(high(xx), csizes<1, 3, 8 + 1, 8 + 3, 5, 7, 8 + 5, 8 + 7>));
}
template <>
-inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>,
- const vec<f32, 16>& x, const vec<f32, 16>&)
+template <>
+inline vec<f32, 16> vec<f32, 16>::shuffle(
+ csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>) const CMT_NOEXCEPT
{
const vec<f32, 16> xx =
- concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
+ concat(low(*this).shuffle(high(*this), csizes<0, 8 + 0, 1, 8 + 1, 4, 8 + 4, 5, 8 + 5>),
+ low(*this).shuffle(high(*this), csizes<2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7>));
return permutegroups<(4), 0, 2, 1, 3>(xx);
}
template <>
-inline vec<f32, 32> shufflevector<f32, 32>(
+template <>
+inline vec<f32, 32> vec<f32, 32>::shuffle(
csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13,
- 29, 14, 30, 15, 31>,
- const vec<f32, 32>& x, const vec<f32, 32>&)
+ 29, 14, 30, 15, 31>) const CMT_NOEXCEPT
{
- const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x);
+ const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(*this);
return concat(interleavehalves(low(xx)), interleavehalves(high(xx)));
}
-} // namespace intrinsics
+
} // namespace CMT_ARCH_NAME
} // namespace kfr
#endif
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -18,6 +18,15 @@ using namespace kfr;
namespace CMT_ARCH_NAME
{
+TEST(print_vector_capacity)
+{
+ println("vector_capacity<float> = ", vector_capacity<float>);
+ println("vector_capacity<double> = ", vector_capacity<double>);
+
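+ // process_width is printed here as vector_capacity / 16, mirroring how
+ // fft_config presumably derives it (an assumption; see fft.hpp).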
+ println("fft_config<float>::process_width = ", vector_capacity<float> / 16);
+ println("fft_config<double>::process_width = ", vector_capacity<double> / 16);
+}
+
#ifdef CMT_NATIVE_F64
constexpr ctypes_t<float, double> dft_float_types{};
#else
@@ -78,7 +87,7 @@ static void perf_test(int size)
TEST(test_performance)
{
- for (int size = 16; size <= 16384; size <<= 1)
+ for (int size = 16; size <= 65536; size <<= 1)
{
perf_test(size);
}
@@ -160,7 +169,7 @@ constexpr size_t fft_stopsize = 12;
constexpr size_t dft_stopsize = 101;
#endif
#else
-constexpr size_t fft_stopsize = 20;
+constexpr size_t fft_stopsize = 21;
#ifndef KFR_DFT_NO_NPo2
constexpr size_t dft_stopsize = 257;
#endif
@@ -183,15 +192,14 @@ TEST(fft_real)
TEST(fft_real_not_size_4N)
{
kfr::univector<double, 6> in = counter();
- auto out = realdft(in);
- kfr::univector<kfr::complex<double>> expected {
- 15.0, { -3, 5.19615242}, {-3, +1.73205081}, -3.0 };
+ auto out = realdft(in);
+ kfr::univector<kfr::complex<double>> expected{ 15.0, { -3, 5.19615242 }, { -3, +1.73205081 }, -3.0 };
CHECK(rms(cabs(out - expected)) <= 0.00001f);
kfr::univector<double, 6> rev = irealdft(out) / 6;
CHECK(rms(rev - in) <= 0.00001f);
- random_state gen = random_init(2247448713, 915890490, 864203735, 2982561);
- constexpr size_t size = 66;
+ random_state gen = random_init(2247448713, 915890490, 864203735, 2982561);
+ constexpr size_t size = 66;
kfr::univector<double, size> in2 = gen_random_range<double>(gen, -1.0, +1.0);
kfr::univector<kfr::complex<double>, size / 2 + 1> out2 = realdft(in2);
kfr::univector<double, size> rev2 = irealdft(out2) / size;