bitrev.hpp (17513B)
/** @addtogroup dft
 *  @{
 */
/*
  Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 2 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include <kfr/simd/complex.hpp>
#include <kfr/simd/constants.hpp>
#include <kfr/simd/digitreverse.hpp>
#include <kfr/simd/vec.hpp>

#include "data/bitrev.hpp"

#include "ft.hpp"

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

namespace intrinsics
{

// Passed as the "aligned" flag to every cread_group/cwrite_group call in this
// file; false means the reorder code never assumes aligned loads/stores.
constexpr inline static bool fft_reorder_aligned = false;

// Number of bits covered by the precomputed bit-reversal lookup table
// (log2 of the table's element count).
constexpr inline static size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table));

// Reverse the low `Bits` bits of x (compile-time bit count).
// On NEON the compiler builtin reverses all 32 bits and the result is shifted
// down; otherwise the lookup table is used when it is wide enough, falling
// back to the computed bitreverse<Bits> for larger sizes.
template <size_t Bits>
CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x)
{
#ifdef CMT_ARCH_NEON
    return __builtin_bitreverse32(x) >> (32 - Bits);
#else
    if constexpr (Bits > bitrev_table_log2N)
        return bitreverse<Bits>(x);

    return data::bitrev_table[x] >> (bitrev_table_log2N - Bits);
#endif
}

// Runtime-bit-count variant of the above. `use_table` selects the lookup-table
// path at compile time; the caller is responsible for only passing
// use_table=true when bits <= bitrev_table_log2N (no check is performed here).
template <bool use_table>
CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits, cbool_t<use_table>)
{
#ifdef CMT_ARCH_NEON
    return __builtin_bitreverse32(x) >> (32 - bits);
#else
    if constexpr (use_table)
    {
        return data::bitrev_table[x] >> (bitrev_table_log2N - bits);
    }
    else
    {
        return bitreverse<32>(x) >> (32 - bits);
    }
#endif
}

// Reverse the low `bits` of x in radix-4 digit order (base-4 digit reversal).
// Implemented as a full bit reversal followed by swapping adjacent bit pairs,
// which turns a bit reversal into a digit-4 reversal.
CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
{
#ifdef CMT_ARCH_NEON
    x = __builtin_bitreverse32(x);
    // Swap adjacent bits to convert the bit reversal into a digit-4 reversal.
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = x >> (32 - bits);
    return x;
#else
    if (bits > bitrev_table_log2N)
    {
        if (bits <= 16)
            return digitreverse4<16>(x) >> (16 - bits);
        else
            return digitreverse4<32>(x) >> (32 - bits);
    }

    x = data::bitrev_table[x];
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = x >> (bitrev_table_log2N - bits);
    return x;
#endif
}

// In-place reorder of a single self-paired 4x4 group of complex values at
// scalar offset i: read the group, digit-reverse it within the vector
// (radix given by `bitrev`), write it back to the same location.
// Here the group stride N4/2 is a compile-time template argument.
template <size_t log2n, size_t bitrev, typename T>
KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i)
{
    using cxx = cvec<T, 16>;
    constexpr size_t N = 1 << log2n;
    constexpr size_t N4 = 2 * N / 4; // scalar (not complex) count of N/4
    cxx vi = cread_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i));
    vi = digitreverse<bitrev, 2>(vi);
    cwrite_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), vi);
}

// Reorder two independent groups at offsets i and j, each reversed in place
// (vi is written back to i and vj back to j — no exchange between them).
template <size_t log2n, size_t bitrev, typename T>
KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j)
{
    CMT_ASSUME(i != j);
    using cxx = cvec<T, 16>;
    constexpr size_t N = 1 << log2n;
    constexpr size_t N4 = 2 * N / 4;

    cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2);
    cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2);

    vi = digitreverse<bitrev, 2>(vi);
    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vi);
    vj = digitreverse<bitrev, 2>(vj);
    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vj);
}

// Exchange-and-reverse two groups: group i is digit-reversed and written to j,
// group j is digit-reversed and written to i (the cross write is what makes
// this a swap, unlike fft_reorder_swap_two above).
template <size_t log2n, size_t bitrev, typename T>
KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j)
{
    CMT_ASSUME(i != j);
    using cxx = cvec<T, 16>;
    constexpr size_t N = 1 << log2n;
    constexpr size_t N4 = 2 * N / 4;

    cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2);
    cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2);

    vi = digitreverse<bitrev, 2>(vi);
    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vi);
    vj = digitreverse<bitrev, 2>(vj);
    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vj);
}

// complex<T>* convenience wrappers: convert complex-element offsets to scalar
// offsets (x2) and forward to the T* implementations above.
template <size_t log2n, size_t bitrev, typename T>
KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i)
{
    fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2);
}

template <size_t log2n, size_t bitrev, typename T>
KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1)
{
    fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2);
}

template <size_t log2n, size_t bitrev, typename T>
KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j)
{
    fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2);
}

// Fully unrolled reorder schedule for N = 2^11. Each call handles one 4x4
// group pair; swap_two entries are the self-paired positions, swap entries
// exchange a position with its reversed counterpart. The index pairs are a
// precomputed permutation schedule — do not modify individual entries.
// NOTE(review): these calls pass only <11> while fft_reorder_swap/_two declare
// <log2n, bitrev, T> with no default visible in this file — verify that a
// default for `bitrev` exists (or was lost in extraction) before relying on
// this compiling as-is.
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>)
{
    fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4);
    fft_reorder_swap<11>(inout, 1 * 4, 64 * 4);
    fft_reorder_swap<11>(inout, 2 * 4, 32 * 4);
    fft_reorder_swap<11>(inout, 3 * 4, 96 * 4);
    fft_reorder_swap<11>(inout, 4 * 4, 16 * 4);
    fft_reorder_swap<11>(inout, 5 * 4, 80 * 4);
    fft_reorder_swap<11>(inout, 6 * 4, 48 * 4);
    fft_reorder_swap<11>(inout, 7 * 4, 112 * 4);
    fft_reorder_swap<11>(inout, 9 * 4, 72 * 4);
    fft_reorder_swap<11>(inout, 10 * 4, 40 * 4);
    fft_reorder_swap<11>(inout, 11 * 4, 104 * 4);
    fft_reorder_swap<11>(inout, 12 * 4, 24 * 4);
    fft_reorder_swap<11>(inout, 13 * 4, 88 * 4);
    fft_reorder_swap<11>(inout, 14 * 4, 56 * 4);
    fft_reorder_swap<11>(inout, 15 * 4, 120 * 4);
    fft_reorder_swap<11>(inout, 17 * 4, 68 * 4);
    fft_reorder_swap<11>(inout, 18 * 4, 36 * 4);
    fft_reorder_swap<11>(inout, 19 * 4, 100 * 4);
    fft_reorder_swap_two<11>(inout, 20 * 4, 28 * 4);
    fft_reorder_swap<11>(inout, 21 * 4, 84 * 4);
    fft_reorder_swap<11>(inout, 22 * 4, 52 * 4);
    fft_reorder_swap<11>(inout, 23 * 4, 116 * 4);
    fft_reorder_swap<11>(inout, 25 * 4, 76 * 4);
    fft_reorder_swap<11>(inout, 26 * 4, 44 * 4);
    fft_reorder_swap<11>(inout, 27 * 4, 108 * 4);
    fft_reorder_swap<11>(inout, 29 * 4, 92 * 4);
    fft_reorder_swap<11>(inout, 30 * 4, 60 * 4);
    fft_reorder_swap<11>(inout, 31 * 4, 124 * 4);
    fft_reorder_swap<11>(inout, 33 * 4, 66 * 4);
    fft_reorder_swap_two<11>(inout, 34 * 4, 42 * 4);
    fft_reorder_swap<11>(inout, 35 * 4, 98 * 4);
    fft_reorder_swap<11>(inout, 37 * 4, 82 * 4);
    fft_reorder_swap<11>(inout, 38 * 4, 50 * 4);
    fft_reorder_swap<11>(inout, 39 * 4, 114 * 4);
    fft_reorder_swap<11>(inout, 41 * 4, 74 * 4);
    fft_reorder_swap<11>(inout, 43 * 4, 106 * 4);
    fft_reorder_swap<11>(inout, 45 * 4, 90 * 4);
    fft_reorder_swap<11>(inout, 46 * 4, 58 * 4);
    fft_reorder_swap<11>(inout, 47 * 4, 122 * 4);
    fft_reorder_swap<11>(inout, 49 * 4, 70 * 4);
    fft_reorder_swap<11>(inout, 51 * 4, 102 * 4);
    fft_reorder_swap<11>(inout, 53 * 4, 86 * 4);
    fft_reorder_swap_two<11>(inout, 54 * 4, 62 * 4);
    fft_reorder_swap<11>(inout, 55 * 4, 118 * 4);
    fft_reorder_swap<11>(inout, 57 * 4, 78 * 4);
    fft_reorder_swap<11>(inout, 59 * 4, 110 * 4);
    fft_reorder_swap<11>(inout, 61 * 4, 94 * 4);
    fft_reorder_swap<11>(inout, 63 * 4, 126 * 4);
    fft_reorder_swap_two<11>(inout, 65 * 4, 73 * 4);
    fft_reorder_swap<11>(inout, 67 * 4, 97 * 4);
    fft_reorder_swap<11>(inout, 69 * 4, 81 * 4);
    fft_reorder_swap<11>(inout, 71 * 4, 113 * 4);
    fft_reorder_swap<11>(inout, 75 * 4, 105 * 4);
    fft_reorder_swap<11>(inout, 77 * 4, 89 * 4);
    fft_reorder_swap<11>(inout, 79 * 4, 121 * 4);
    fft_reorder_swap<11>(inout, 83 * 4, 101 * 4);
    fft_reorder_swap_two<11>(inout, 85 * 4, 93 * 4);
    fft_reorder_swap<11>(inout, 87 * 4, 117 * 4);
    fft_reorder_swap<11>(inout, 91 * 4, 109 * 4);
    fft_reorder_swap<11>(inout, 95 * 4, 125 * 4);
    fft_reorder_swap_two<11>(inout, 99 * 4, 107 * 4);
    fft_reorder_swap<11>(inout, 103 * 4, 115 * 4);
    fft_reorder_swap<11>(inout, 111 * 4, 123 * 4);
    fft_reorder_swap_two<11>(inout, 119 * 4, 127 * 4);
}

// Unrolled reorder schedule for N = 2^7 (bit-reversal order, radix 2).
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>)
{
    constexpr size_t bitrev = 2;
    fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4);
    fft_reorder_swap<7, bitrev>(inout, 1 * 4, 4 * 4);
    fft_reorder_swap<7, bitrev>(inout, 3 * 4, 6 * 4);
    fft_reorder_swap_two<7, bitrev>(inout, 5 * 4, 7 * 4);
}

// Unrolled reorder schedule for N = 2^8, digit-4 (use_br2 == false) variant.
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */)
{
    constexpr size_t bitrev = 4;
    fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4);
    fft_reorder_swap<8, bitrev>(inout, 1 * 4, 4 * 4);
    fft_reorder_swap<8, bitrev>(inout, 2 * 4, 8 * 4);
    fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
    fft_reorder_swap<8, bitrev>(inout, 6 * 4, 9 * 4);
    fft_reorder_swap<8, bitrev>(inout, 7 * 4, 13 * 4);
    fft_reorder_swap_two<8, bitrev>(inout, 10 * 4, 15 * 4);
    fft_reorder_swap<8, bitrev>(inout, 11 * 4, 14 * 4);
}

// Unrolled reorder schedule for N = 2^8, bit-reversal (use_br2 == true) variant.
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */)
{
    constexpr size_t bitrev = 2;
    fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4);
    fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4);
    fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4);
    fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4);
    fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4);
    fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4);
    fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4);
    fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4);
}

// Unrolled reorder schedule for N = 2^9 (bit-reversal order, radix 2).
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
{
    constexpr size_t bitrev = 2;
    fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4);
    fft_reorder_swap<9, bitrev>(inout, 1 * 4, 16 * 4);
    fft_reorder_swap<9, bitrev>(inout, 2 * 4, 8 * 4);
    fft_reorder_swap<9, bitrev>(inout, 3 * 4, 24 * 4);
    fft_reorder_swap<9, bitrev>(inout, 5 * 4, 20 * 4);
    fft_reorder_swap<9, bitrev>(inout, 6 * 4, 12 * 4);
    fft_reorder_swap<9, bitrev>(inout, 7 * 4, 28 * 4);
    fft_reorder_swap<9, bitrev>(inout, 9 * 4, 18 * 4);
    fft_reorder_swap_two<9, bitrev>(inout, 10 * 4, 14 * 4);
    fft_reorder_swap<9, bitrev>(inout, 11 * 4, 26 * 4);
    fft_reorder_swap<9, bitrev>(inout, 13 * 4, 22 * 4);
    fft_reorder_swap<9, bitrev>(inout, 15 * 4, 30 * 4);
    fft_reorder_swap_two<9, bitrev>(inout, 17 * 4, 21 * 4);
    fft_reorder_swap<9, bitrev>(inout, 19 * 4, 25 * 4);
    fft_reorder_swap<9, bitrev>(inout, 23 * 4, 29 * 4);
    fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4);
}

// Unrolled reorder schedule for N = 2^10, bit-reversal (use_br2 == true) variant.
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */)
{
    constexpr size_t bitrev = 2;
    fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4);
    fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4);
    fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4);
    fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4);
    fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4);
    fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4);
    fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4);
    fft_reorder_swap<10, bitrev>(inout, 7 * 4, 56 * 4);
    fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4);
    fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4);
    fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4);
    fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4);
    fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4);
    fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4);
    fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4);
    fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4);
    fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4);
    fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4);
    fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4);
    fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4);
    fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4);
    fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4);
    fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4);
    fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4);
    fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4);
    fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4);
    fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4);
    fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4);
    fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4);
    fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4);
    fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4);
    fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4);
}

// Write a 4x4 group digit-reversed; the radix (2 or 4) is selected at compile
// time from use_br2. Shared by the runtime (non-unrolled) reorder paths below.
template <typename T, bool use_br2>
KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
{
    cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4,
                                            digitreverse<(use_br2 ? 2 : 4), 2>(value));
}

// Runtime variant of the exchange-and-reverse swap: reads groups at i and j,
// writes each (digit-reversed) to the other's location. Correct for i == j as
// a self-reverse only if the CMT_ASSUME below is removed — callers in this
// file do invoke it with i == j via the `i >= j` guards, so the assumption is
// on equal-index behavior of the reads/writes, not on i != j at the call site.
template <typename T, bool use_br2>
KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>)
{
    CMT_ASSUME(i != j);
    const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4);
    const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4);
    cwrite_reordered(inout + j, vi, N4, cbool_t<use_br2>());
    cwrite_reordered(inout + i, vj, N4, cbool_t<use_br2>());
}

// Generic bit-reversal reorder for arbitrary log2n (use_br2 == true path).
// Walks the index space in steps of four groups, computing each partner index
// j by bit-reversing i; the `i >= j` guards ensure each (i, j) pair is
// processed exactly once. jstep1/jstep2 are the increments of the reversed
// index when i advances by istep — presumably derived from how a bit-reversed
// counter changes under small increments; TODO(review): confirm against the
// scalar reference before touching.
template <typename T, bool use_table>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2, cbool_t<use_table>)
{
    const size_t N = size_t(1) << log2n;
    const size_t N4 = N / 4;
    const size_t iend = N / 16 * 4 * 2; // scalar index bound
    constexpr size_t istep = 2 * 4; // one 4-complex group in scalar units
    const size_t jstep1 = (1 << (log2n - 5)) * 4 * 2;
    const size_t jstep2 = size_t(size_t(1) << (log2n - 5)) * 4 * 2 - size_t(size_t(1) << (log2n - 6)) * 4 * 2;
    T* io = ptr_cast<T>(inout);

    for (size_t i = 0; i < iend;)
    {
        size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4, cbool<use_table>) << 3;
        if (i >= j)
        {
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        }
        else
        {
            // Entire run of four groups already handled from the other side.
            i += 4 * istep;
            continue;
        }
        i += istep;
        j = j + jstep1;

        if (i >= j)
        {
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        }
        i += istep;
        j = j - jstep2;

        if (i >= j)
        {
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        }
        i += istep;
        j = j + jstep1;

        if (i >= j)
        {
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        }
        i += istep;
    }
}

// Dispatcher: choose the table-driven bit reversal when the table covers
// log2n - 4 bits, otherwise fall back to the computed reversal.
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
{
    if (log2n - 4 > bitrev_table_log2N)
    {
        fft_reorder(inout, log2n, ctrue, cfalse);
    }
    else
    {
        fft_reorder(inout, log2n, ctrue, ctrue);
    }
}

// Generic digit-4 reorder for arbitrary log2n (use_br2 == false path).
// The index range is processed in four consecutive quarters of N16 scalar
// indices each; in later quarters progressively more of the partner swaps are
// known to be needed unconditionally (only the last partner in each batch
// keeps its `i >= j` guard). The loop bodies differ only in how many
// unconditional swaps precede the guarded one.
template <typename T>
KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
{
    const size_t N = size_t(1) << log2n;
    const size_t N4 = N / 4;
    const size_t N16 = N * 2 / 16; // one quarter of the scalar index range
    size_t iend = N16;
    constexpr size_t istep = 2 * 4;
    const size_t jstep = N / 64 * 4 * 2;
    T* io = ptr_cast<T>(inout);

    size_t i = 0;
    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    for (; i < iend;)
    {
        size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;

        if (i >= j)
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        i += istep * 4;
    }
    iend += N16;
    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    for (; i < iend;)
    {
        size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;

        fft_reorder_swap_n4(io, i, j, N4, use_br2);

        i += istep;
        j = j + jstep;

        if (i >= j)
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        i += istep * 3;
    }
    iend += N16;
    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    for (; i < iend;)
    {
        size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;

        fft_reorder_swap_n4(io, i, j, N4, use_br2);

        i += istep;
        j = j + jstep;

        fft_reorder_swap_n4(io, i, j, N4, use_br2);

        i += istep;
        j = j + jstep;

        if (i >= j)
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        i += istep * 2;
    }
    iend += N16;
    CMT_PRAGMA_CLANG(clang loop unroll_count(2))
    for (; i < iend;)
    {
        size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3;

        fft_reorder_swap_n4(io, i, j, N4, use_br2);

        i += istep;
        j = j + jstep;

        fft_reorder_swap_n4(io, i, j, N4, use_br2);

        i += istep;
        j = j + jstep;

        fft_reorder_swap_n4(io, i, j, N4, use_br2);

        i += istep;
        j = j + jstep;

        if (i >= j)
            fft_reorder_swap_n4(io, i, j, N4, use_br2);
        i += istep;
    }
}
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr