kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit bd42f14af6247ae4ac578c8fd643be66b62d505d
parent 9d3794faed25e91d92c1f448335d2ce549d32b98
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Thu, 14 Dec 2023 00:39:36 +0000

ctranspose: cast ps/pd

Diffstat:
M include/kfr/dft/impl/fft-impl.hpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -75,14 +75,14 @@ KFR_INTRINSIC vec<float, 32> ctranspose<4, float, 32>(const vec<float, 32>& v16)
 {
     cvec<float, 4> r0, r1, r2, r3;
     split(v16, r0, r1, r2, r3);
-    const __m256d t0 = _mm256_unpacklo_pd(r0.v, r1.v);
-    const __m256d t1 = _mm256_unpacklo_pd(r2.v, r3.v);
-    const __m256d t2 = _mm256_unpackhi_pd(r0.v, r1.v);
-    const __m256d t3 = _mm256_unpackhi_pd(r2.v, r3.v);
-    r0.v = _mm256_permute2f128_pd(t0, t1, 0x20);
-    r1.v = _mm256_permute2f128_pd(t2, t3, 0x20);
-    r2.v = _mm256_permute2f128_pd(t0, t1, 0x31);
-    r3.v = _mm256_permute2f128_pd(t2, t3, 0x31);
+    const __m256d t0 = _mm256_unpacklo_pd(_mm256_castps_pd(r0.v), _mm256_castps_pd(r1.v));
+    const __m256d t1 = _mm256_unpacklo_pd(_mm256_castps_pd(r2.v), _mm256_castps_pd(r3.v));
+    const __m256d t2 = _mm256_unpackhi_pd(_mm256_castps_pd(r0.v), _mm256_castps_pd(r1.v));
+    const __m256d t3 = _mm256_unpackhi_pd(_mm256_castps_pd(r2.v), _mm256_castps_pd(r3.v));
+    r0.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t0, t1, 0x20));
+    r1.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t2, t3, 0x20));
+    r2.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t0, t1, 0x31));
+    r3.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t2, t3, 0x31));
     return concat(r0, r1, r2, r3);
 }
 #endif