kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

bitrev.hpp (17513B)


      1 /** @addtogroup dft
      2  *  @{
      3  */
      4 /*
      5   Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
      6   This file is part of KFR
      7 
      8   KFR is free software: you can redistribute it and/or modify
      9   it under the terms of the GNU General Public License as published by
     10   the Free Software Foundation, either version 2 of the License, or
     11   (at your option) any later version.
     12 
     13   KFR is distributed in the hope that it will be useful,
     14   but WITHOUT ANY WARRANTY; without even the implied warranty of
     15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16   GNU General Public License for more details.
     17 
     18   You should have received a copy of the GNU General Public License
     19   along with KFR.
     20 
     21   If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
     22   Buying a commercial license is mandatory as soon as you develop commercial activities without
     23   disclosing the source code of your own applications.
     24   See https://www.kfrlib.com for details.
     25  */
     26 #pragma once
     27 
     28 #include <kfr/simd/complex.hpp>
     29 #include <kfr/simd/constants.hpp>
     30 #include <kfr/simd/digitreverse.hpp>
     31 #include <kfr/simd/vec.hpp>
     32 
     33 #include "data/bitrev.hpp"
     34 
     35 #include "ft.hpp"
     36 
     37 namespace kfr
     38 {
     39 inline namespace CMT_ARCH_NAME
     40 {
     41 
     42 namespace intrinsics
     43 {
     44 
     45 constexpr inline static bool fft_reorder_aligned = false;
     46 
     47 constexpr inline static size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table));
     48 
     49 template <size_t Bits>
     50 CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x)
     51 {
     52 #ifdef CMT_ARCH_NEON
     53     return __builtin_bitreverse32(x) >> (32 - Bits);
     54 #else
     55     if constexpr (Bits > bitrev_table_log2N)
     56         return bitreverse<Bits>(x);
     57 
     58     return data::bitrev_table[x] >> (bitrev_table_log2N - Bits);
     59 #endif
     60 }
     61 
     62 template <bool use_table>
     63 CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits, cbool_t<use_table>)
     64 {
     65 #ifdef CMT_ARCH_NEON
     66     return __builtin_bitreverse32(x) >> (32 - bits);
     67 #else
     68     if constexpr (use_table)
     69     {
     70         return data::bitrev_table[x] >> (bitrev_table_log2N - bits);
     71     }
     72     else
     73     {
     74         return bitreverse<32>(x) >> (32 - bits);
     75     }
     76 #endif
     77 }
     78 
     79 CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
     80 {
     81 #ifdef CMT_ARCH_NEON
     82     x = __builtin_bitreverse32(x);
     83     x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
     84     x = x >> (32 - bits);
     85     return x;
     86 #else
     87     if (bits > bitrev_table_log2N)
     88     {
     89         if (bits <= 16)
     90             return digitreverse4<16>(x) >> (16 - bits);
     91         else
     92             return digitreverse4<32>(x) >> (32 - bits);
     93     }
     94 
     95     x = data::bitrev_table[x];
     96     x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
     97     x = x >> (bitrev_table_log2N - bits);
     98     return x;
     99 #endif
    100 }
    101 
    102 template <size_t log2n, size_t bitrev, typename T>
    103 KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i)
    104 {
    105     using cxx           = cvec<T, 16>;
    106     constexpr size_t N  = 1 << log2n;
    107     constexpr size_t N4 = 2 * N / 4;
    108 
    109     cxx vi = cread_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i));
    110     vi     = digitreverse<bitrev, 2>(vi);
    111     cwrite_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), vi);
    112 }
    113 
    114 template <size_t log2n, size_t bitrev, typename T>
    115 KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j)
    116 {
    117     CMT_ASSUME(i != j);
    118     using cxx           = cvec<T, 16>;
    119     constexpr size_t N  = 1 << log2n;
    120     constexpr size_t N4 = 2 * N / 4;
    121 
    122     cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2);
    123     cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2);
    124 
    125     vi = digitreverse<bitrev, 2>(vi);
    126     cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vi);
    127     vj = digitreverse<bitrev, 2>(vj);
    128     cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vj);
    129 }
    130 
    131 template <size_t log2n, size_t bitrev, typename T>
    132 KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j)
    133 {
    134     CMT_ASSUME(i != j);
    135     using cxx           = cvec<T, 16>;
    136     constexpr size_t N  = 1 << log2n;
    137     constexpr size_t N4 = 2 * N / 4;
    138 
    139     cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2);
    140     cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2);
    141 
    142     vi = digitreverse<bitrev, 2>(vi);
    143     cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vi);
    144     vj = digitreverse<bitrev, 2>(vj);
    145     cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vj);
    146 }
    147 
    148 template <size_t log2n, size_t bitrev, typename T>
    149 KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i)
    150 {
    151     fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2);
    152 }
    153 
    154 template <size_t log2n, size_t bitrev, typename T>
    155 KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1)
    156 {
    157     fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2);
    158 }
    159 
    160 template <size_t log2n, size_t bitrev, typename T>
    161 KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j)
    162 {
    163     fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2);
    164 }
    165 
    166 template <typename T>
    167 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>)
    168 {
    169     fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4);
    170     fft_reorder_swap<11>(inout, 1 * 4, 64 * 4);
    171     fft_reorder_swap<11>(inout, 2 * 4, 32 * 4);
    172     fft_reorder_swap<11>(inout, 3 * 4, 96 * 4);
    173     fft_reorder_swap<11>(inout, 4 * 4, 16 * 4);
    174     fft_reorder_swap<11>(inout, 5 * 4, 80 * 4);
    175     fft_reorder_swap<11>(inout, 6 * 4, 48 * 4);
    176     fft_reorder_swap<11>(inout, 7 * 4, 112 * 4);
    177     fft_reorder_swap<11>(inout, 9 * 4, 72 * 4);
    178     fft_reorder_swap<11>(inout, 10 * 4, 40 * 4);
    179     fft_reorder_swap<11>(inout, 11 * 4, 104 * 4);
    180     fft_reorder_swap<11>(inout, 12 * 4, 24 * 4);
    181     fft_reorder_swap<11>(inout, 13 * 4, 88 * 4);
    182     fft_reorder_swap<11>(inout, 14 * 4, 56 * 4);
    183     fft_reorder_swap<11>(inout, 15 * 4, 120 * 4);
    184     fft_reorder_swap<11>(inout, 17 * 4, 68 * 4);
    185     fft_reorder_swap<11>(inout, 18 * 4, 36 * 4);
    186     fft_reorder_swap<11>(inout, 19 * 4, 100 * 4);
    187     fft_reorder_swap_two<11>(inout, 20 * 4, 28 * 4);
    188     fft_reorder_swap<11>(inout, 21 * 4, 84 * 4);
    189     fft_reorder_swap<11>(inout, 22 * 4, 52 * 4);
    190     fft_reorder_swap<11>(inout, 23 * 4, 116 * 4);
    191     fft_reorder_swap<11>(inout, 25 * 4, 76 * 4);
    192     fft_reorder_swap<11>(inout, 26 * 4, 44 * 4);
    193     fft_reorder_swap<11>(inout, 27 * 4, 108 * 4);
    194     fft_reorder_swap<11>(inout, 29 * 4, 92 * 4);
    195     fft_reorder_swap<11>(inout, 30 * 4, 60 * 4);
    196     fft_reorder_swap<11>(inout, 31 * 4, 124 * 4);
    197     fft_reorder_swap<11>(inout, 33 * 4, 66 * 4);
    198     fft_reorder_swap_two<11>(inout, 34 * 4, 42 * 4);
    199     fft_reorder_swap<11>(inout, 35 * 4, 98 * 4);
    200     fft_reorder_swap<11>(inout, 37 * 4, 82 * 4);
    201     fft_reorder_swap<11>(inout, 38 * 4, 50 * 4);
    202     fft_reorder_swap<11>(inout, 39 * 4, 114 * 4);
    203     fft_reorder_swap<11>(inout, 41 * 4, 74 * 4);
    204     fft_reorder_swap<11>(inout, 43 * 4, 106 * 4);
    205     fft_reorder_swap<11>(inout, 45 * 4, 90 * 4);
    206     fft_reorder_swap<11>(inout, 46 * 4, 58 * 4);
    207     fft_reorder_swap<11>(inout, 47 * 4, 122 * 4);
    208     fft_reorder_swap<11>(inout, 49 * 4, 70 * 4);
    209     fft_reorder_swap<11>(inout, 51 * 4, 102 * 4);
    210     fft_reorder_swap<11>(inout, 53 * 4, 86 * 4);
    211     fft_reorder_swap_two<11>(inout, 54 * 4, 62 * 4);
    212     fft_reorder_swap<11>(inout, 55 * 4, 118 * 4);
    213     fft_reorder_swap<11>(inout, 57 * 4, 78 * 4);
    214     fft_reorder_swap<11>(inout, 59 * 4, 110 * 4);
    215     fft_reorder_swap<11>(inout, 61 * 4, 94 * 4);
    216     fft_reorder_swap<11>(inout, 63 * 4, 126 * 4);
    217     fft_reorder_swap_two<11>(inout, 65 * 4, 73 * 4);
    218     fft_reorder_swap<11>(inout, 67 * 4, 97 * 4);
    219     fft_reorder_swap<11>(inout, 69 * 4, 81 * 4);
    220     fft_reorder_swap<11>(inout, 71 * 4, 113 * 4);
    221     fft_reorder_swap<11>(inout, 75 * 4, 105 * 4);
    222     fft_reorder_swap<11>(inout, 77 * 4, 89 * 4);
    223     fft_reorder_swap<11>(inout, 79 * 4, 121 * 4);
    224     fft_reorder_swap<11>(inout, 83 * 4, 101 * 4);
    225     fft_reorder_swap_two<11>(inout, 85 * 4, 93 * 4);
    226     fft_reorder_swap<11>(inout, 87 * 4, 117 * 4);
    227     fft_reorder_swap<11>(inout, 91 * 4, 109 * 4);
    228     fft_reorder_swap<11>(inout, 95 * 4, 125 * 4);
    229     fft_reorder_swap_two<11>(inout, 99 * 4, 107 * 4);
    230     fft_reorder_swap<11>(inout, 103 * 4, 115 * 4);
    231     fft_reorder_swap<11>(inout, 111 * 4, 123 * 4);
    232     fft_reorder_swap_two<11>(inout, 119 * 4, 127 * 4);
    233 }
    234 
    235 template <typename T>
    236 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>)
    237 {
    238     constexpr size_t bitrev = 2;
    239     fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4);
    240     fft_reorder_swap<7, bitrev>(inout, 1 * 4, 4 * 4);
    241     fft_reorder_swap<7, bitrev>(inout, 3 * 4, 6 * 4);
    242     fft_reorder_swap_two<7, bitrev>(inout, 5 * 4, 7 * 4);
    243 }
    244 
    245 template <typename T>
    246 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */)
    247 {
    248     constexpr size_t bitrev = 4;
    249     fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4);
    250     fft_reorder_swap<8, bitrev>(inout, 1 * 4, 4 * 4);
    251     fft_reorder_swap<8, bitrev>(inout, 2 * 4, 8 * 4);
    252     fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
    253     fft_reorder_swap<8, bitrev>(inout, 6 * 4, 9 * 4);
    254     fft_reorder_swap<8, bitrev>(inout, 7 * 4, 13 * 4);
    255     fft_reorder_swap_two<8, bitrev>(inout, 10 * 4, 15 * 4);
    256     fft_reorder_swap<8, bitrev>(inout, 11 * 4, 14 * 4);
    257 }
    258 
    259 template <typename T>
    260 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */)
    261 {
    262     constexpr size_t bitrev = 2;
    263     fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4);
    264     fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4);
    265     fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4);
    266     fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
    267     fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4);
    268     fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4);
    269     fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4);
    270     fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4);
    271 }
    272 
    273 template <typename T>
    274 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
    275 {
    276     constexpr size_t bitrev = 2;
    277     fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4);
    278     fft_reorder_swap<9, bitrev>(inout, 1 * 4, 16 * 4);
    279     fft_reorder_swap<9, bitrev>(inout, 2 * 4, 8 * 4);
    280     fft_reorder_swap<9, bitrev>(inout, 3 * 4, 24 * 4);
    281     fft_reorder_swap<9, bitrev>(inout, 5 * 4, 20 * 4);
    282     fft_reorder_swap<9, bitrev>(inout, 6 * 4, 12 * 4);
    283     fft_reorder_swap<9, bitrev>(inout, 7 * 4, 28 * 4);
    284     fft_reorder_swap<9, bitrev>(inout, 9 * 4, 18 * 4);
    285     fft_reorder_swap_two<9, bitrev>(inout, 10 * 4, 14 * 4);
    286     fft_reorder_swap<9, bitrev>(inout, 11 * 4, 26 * 4);
    287     fft_reorder_swap<9, bitrev>(inout, 13 * 4, 22 * 4);
    288     fft_reorder_swap<9, bitrev>(inout, 15 * 4, 30 * 4);
    289     fft_reorder_swap_two<9, bitrev>(inout, 17 * 4, 21 * 4);
    290     fft_reorder_swap<9, bitrev>(inout, 19 * 4, 25 * 4);
    291     fft_reorder_swap<9, bitrev>(inout, 23 * 4, 29 * 4);
    292     fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4);
    293 }
    294 
    295 template <typename T>
    296 KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */)
    297 {
    298     constexpr size_t bitrev = 2;
    299     fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4);
    300     fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4);
    301     fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4);
    302     fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4);
    303     fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4);
    304     fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4);
    305     fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4);
    306     fft_reorder_swap<10, bitrev>(inout, 7 * 4, 56 * 4);
    307     fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4);
    308     fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4);
    309     fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4);
    310     fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4);
    311     fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4);
    312     fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4);
    313     fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4);
    314     fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4);
    315     fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4);
    316     fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4);
    317     fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4);
    318     fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4);
    319     fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4);
    320     fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4);
    321     fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4);
    322     fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4);
    323     fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4);
    324     fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4);
    325     fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4);
    326     fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4);
    327     fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4);
    328     fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4);
    329     fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4);
    330     fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4);
    331 }
    332 
    333 template <typename T, bool use_br2>
    334 KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
    335 {
    336     cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4,
    337                                             digitreverse<(use_br2 ? 2 : 4), 2>(value));
    338 }
    339 
    340 template <typename T, bool use_br2>
    341 KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>)
    342 {
    343     CMT_ASSUME(i != j);
    344     const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4);
    345     const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4);
    346     cwrite_reordered(inout + j, vi, N4, cbool_t<use_br2>());
    347     cwrite_reordered(inout + i, vj, N4, cbool_t<use_br2>());
    348 }
    349 
    350 template <typename T, bool use_table>
    351 KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2, cbool_t<use_table>)
    352 {
    353     const size_t N         = size_t(1) << log2n;
    354     const size_t N4        = N / 4;
    355     const size_t iend      = N / 16 * 4 * 2;
    356     constexpr size_t istep = 2 * 4;
    357     const size_t jstep1    = (1 << (log2n - 5)) * 4 * 2;
    358     const size_t jstep2 = size_t(size_t(1) << (log2n - 5)) * 4 * 2 - size_t(size_t(1) << (log2n - 6)) * 4 * 2;
    359     T* io               = ptr_cast<T>(inout);
    360 
    361     for (size_t i = 0; i < iend;)
    362     {
    363         size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4, cbool<use_table>) << 3;
    364         if (i >= j)
    365         {
    366             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    367         }
    368         else
    369         {
    370             i += 4 * istep;
    371             continue;
    372         }
    373         i += istep;
    374         j = j + jstep1;
    375 
    376         if (i >= j)
    377         {
    378             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    379         }
    380         i += istep;
    381         j = j - jstep2;
    382 
    383         if (i >= j)
    384         {
    385             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    386         }
    387         i += istep;
    388         j = j + jstep1;
    389 
    390         if (i >= j)
    391         {
    392             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    393         }
    394         i += istep;
    395     }
    396 }
    397 
    398 template <typename T>
    399 KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
    400 {
    401     if (log2n - 4 > bitrev_table_log2N)
    402     {
    403         fft_reorder(inout, log2n, ctrue, cfalse);
    404     }
    405     else
    406     {
    407         fft_reorder(inout, log2n, ctrue, ctrue);
    408     }
    409 }
    410 
    411 template <typename T>
    412 KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
    413 {
    414     const size_t N         = size_t(1) << log2n;
    415     const size_t N4        = N / 4;
    416     const size_t N16       = N * 2 / 16;
    417     size_t iend            = N16;
    418     constexpr size_t istep = 2 * 4;
    419     const size_t jstep     = N / 64 * 4 * 2;
    420     T* io                  = ptr_cast<T>(inout);
    421 
    422     size_t i = 0;
    423     CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    424     for (; i < iend;)
    425     {
    426         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
    427 
    428         if (i >= j)
    429             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    430         i += istep * 4;
    431     }
    432     iend += N16;
    433     CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    434     for (; i < iend;)
    435     {
    436         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
    437 
    438         fft_reorder_swap_n4(io, i, j, N4, use_br2);
    439 
    440         i += istep;
    441         j = j + jstep;
    442 
    443         if (i >= j)
    444             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    445         i += istep * 3;
    446     }
    447     iend += N16;
    448     CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    449     for (; i < iend;)
    450     {
    451         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
    452 
    453         fft_reorder_swap_n4(io, i, j, N4, use_br2);
    454 
    455         i += istep;
    456         j = j + jstep;
    457 
    458         fft_reorder_swap_n4(io, i, j, N4, use_br2);
    459 
    460         i += istep;
    461         j = j + jstep;
    462 
    463         if (i >= j)
    464             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    465         i += istep * 2;
    466     }
    467     iend += N16;
    468     CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    469     for (; i < iend;)
    470     {
    471         size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;
    472 
    473         fft_reorder_swap_n4(io, i, j, N4, use_br2);
    474 
    475         i += istep;
    476         j = j + jstep;
    477 
    478         fft_reorder_swap_n4(io, i, j, N4, use_br2);
    479 
    480         i += istep;
    481         j = j + jstep;
    482 
    483         fft_reorder_swap_n4(io, i, j, N4, use_br2);
    484 
    485         i += istep;
    486         j = j + jstep;
    487 
    488         if (i >= j)
    489             fft_reorder_swap_n4(io, i, j, N4, use_br2);
    490         i += istep;
    491     }
    492 }
    493 } // namespace intrinsics
    494 } // namespace CMT_ARCH_NAME
    495 } // namespace kfr