kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

function.hpp (16814B)


      1 /*
      2   Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
      3   This file is part of KFR
      4 
      5   KFR is free software: you can redistribute it and/or modify
      6   it under the terms of the GNU General Public License as published by
      7   the Free Software Foundation, either version 2 of the License, or
      8   (at your option) any later version.
      9 
     10   KFR is distributed in the hope that it will be useful,
     11   but WITHOUT ANY WARRANTY; without even the implied warranty of
     12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13   GNU General Public License for more details.
     14 
     15   You should have received a copy of the GNU General Public License
     16   along with KFR.
     17 
     18   If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
     19   Buying a commercial license is mandatory as soon as you develop commercial activities without
     20   disclosing the source code of your own applications.
     21   See https://www.kfrlib.com for details.
     22  */
     23 #pragma once
     24 
     25 #include "../shuffle.hpp"
     26 #include "../types.hpp"
     27 #include "../vec.hpp"
     28 
     29 CMT_PRAGMA_GNU(GCC diagnostic push)
     30 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
     31 
     32 namespace kfr
     33 {
     34 inline namespace CMT_ARCH_NAME
     35 {
     36 
// Fallback for non-floating-point element types: promotes the input vector to the
// corresponding floating-point type (flt_type<T>) and forwards to the float overload
// of the same intrinsic. The result keeps the promoted element type.
#define KFR_HANDLE_NOT_F_1(fn)                                                                               \
    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)>                                           \
    KFR_INTRINSIC vec<flt_type<T>, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                    \
    {                                                                                                        \
        return intrinsics::fn(promoteto<flt_type<T>>(a));                                                    \
    }
     43 
// Scalar fallback: enabled only when none of the arguments is a vec. Each scalar is
// wrapped in a 1-element vector (vec1<Tout>), the vector intrinsic is invoked, and the
// result is unwrapped back to a scalar of the common type of all arguments.
#define KFR_HANDLE_SCALAR(fn)                                                                                \
    template <typename T1, typename... Args, typename Tout = std::common_type_t<T1, Args...>,                \
              KFR_ENABLE_IF(!(is_vec<T1> || (is_vec<Args> || ...)))>                                         \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }
     52 
// Same scalar fallback as KFR_HANDLE_SCALAR, but the result type Tout is fixed by the
// macro invocation instead of being the common type of the arguments.
// (The common type T is still computed as a template parameter but is otherwise unused.)
#define KFR_HANDLE_SCALAR_1_T(fn, Tout)                                                                      \
    template <typename T1, typename... Args, typename T = std::common_type_t<T1, Args...>,                   \
              KFR_ENABLE_IF(!(is_vec<T1> || (is_vec<Args> || ...)))>                                         \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }
     61 
// Counterpart of KFR_HANDLE_SCALAR_1_T enabled when AT LEAST ONE argument is a vec.
// Arguments are passed through vec1<Tout> (which leaves vec types unchanged and wraps
// scalars into vec<Tout, 1>) before calling the intrinsic; result type is fixed to Tout.
// NOTE(review): relies on vec1<Tout> being a no-op for vec arguments — confirm the
// intended behavior for mixed vec/scalar argument lists.
#define KFR_HANDLE_ARGS_T(fn, Tout)                                                                          \
    template <typename T1, typename... Args, typename T = std::common_type_t<T1, Args...>,                   \
              KFR_ENABLE_IF((is_vec<T1> || (is_vec<Args> || ...)))>                                          \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
    {                                                                                                        \
        using vecout = vec1<Tout>;                                                                           \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
    }
     70 
     71 namespace intrinsics
     72 {
#ifdef CMT_ARCH_X86
// Fixed-width vector aliases matching the x86 SIMD register sizes:
// 128-bit (SSE) lanes.
using f32sse = vec<f32, 4>;
using f64sse = vec<f64, 2>;
using i8sse  = vec<i8, 16>;
using i16sse = vec<i16, 8>;
using i32sse = vec<i32, 4>;
using i64sse = vec<i64, 2>;
using u8sse  = vec<u8, 16>;
using u16sse = vec<u16, 8>;
using u32sse = vec<u32, 4>;
using u64sse = vec<u64, 2>;

// 256-bit (AVX) lanes.
using f32avx = vec<f32, 8>;
using f64avx = vec<f64, 4>;
using i8avx  = vec<i8, 32>;
using i16avx = vec<i16, 16>;
using i32avx = vec<i32, 8>;
using i64avx = vec<i64, 4>;
using u8avx  = vec<u8, 32>;
using u16avx = vec<u16, 16>;
using u32avx = vec<u32, 8>;
using u64avx = vec<u64, 4>;

// 512-bit (AVX-512) lanes.
using f32avx512 = vec<f32, 16>;
using f64avx512 = vec<f64, 8>;
using i8avx512  = vec<i8, 64>;
using i16avx512 = vec<i16, 32>;
using i32avx512 = vec<i32, 16>;
using i64avx512 = vec<i64, 8>;
using u8avx512  = vec<u8, 64>;
using u16avx512 = vec<u16, 32>;
using u32avx512 = vec<u32, 16>;
using u64avx512 = vec<u64, 8>;

// Mask (per-lane boolean) counterparts of the vector aliases above, same widths.
using mf32sse = mask<f32, 4>;
using mf64sse = mask<f64, 2>;
using mi8sse  = mask<i8, 16>;
using mi16sse = mask<i16, 8>;
using mi32sse = mask<i32, 4>;
using mi64sse = mask<i64, 2>;
using mu8sse  = mask<u8, 16>;
using mu16sse = mask<u16, 8>;
using mu32sse = mask<u32, 4>;
using mu64sse = mask<u64, 2>;

using mf32avx = mask<f32, 8>;
using mf64avx = mask<f64, 4>;
using mi8avx  = mask<i8, 32>;
using mi16avx = mask<i16, 16>;
using mi32avx = mask<i32, 8>;
using mi64avx = mask<i64, 4>;
using mu8avx  = mask<u8, 32>;
using mu16avx = mask<u16, 16>;
using mu32avx = mask<u32, 8>;
using mu64avx = mask<u64, 4>;

using mf32avx512 = mask<f32, 16>;
using mf64avx512 = mask<f64, 8>;
using mi8avx512  = mask<i8, 64>;
using mi16avx512 = mask<i16, 32>;
using mi32avx512 = mask<i32, 16>;
using mi64avx512 = mask<i64, 8>;
using mu8avx512  = mask<u8, 64>;
using mu16avx512 = mask<u16, 32>;
using mu32avx512 = mask<u32, 16>;
using mu64avx512 = mask<u64, 8>;

#else
// Non-x86 build (ARM NEON per the naming): 128-bit vector and mask aliases only.
using f32neon = vec<f32, 4>;
using f64neon = vec<f64, 2>;
using i8neon  = vec<i8, 16>;
using i16neon = vec<i16, 8>;
using i32neon = vec<i32, 4>;
using i64neon = vec<i64, 2>;
using u8neon  = vec<u8, 16>;
using u16neon = vec<u16, 8>;
using u32neon = vec<u32, 4>;
using u64neon = vec<u64, 2>;

using mf32neon = mask<f32, 4>;
using mf64neon = mask<f64, 2>;
using mi8neon  = mask<i8, 16>;
using mi16neon = mask<i16, 8>;
using mi32neon = mask<i32, 4>;
using mi64neon = mask<i64, 2>;
using mu8neon  = mask<u8, 16>;
using mu16neon = mask<u16, 8>;
using mu32neon = mask<u32, 4>;
using mu64neon = mask<u64, 2>;
#endif
    163 
    164 template <typename T>
    165 constexpr inline size_t next_simd_width(size_t n) CMT_NOEXCEPT
    166 {
    167     return n < minimum_vector_width<T> ? minimum_vector_width<T> : next_poweroftwo(n);
    168 }
    169 
// Expands a single-element vector to a full SIMD register by broadcasting its value
// to Nout lanes. N is not deducible from the vec<T, 1> argument and must be supplied
// explicitly by the caller; Nout defaults to the next SIMD width for N.
template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, 1>& x) CMT_NOEXCEPT
{
    return broadcast<Nout>(x);
}
    175 
// Expands an N-element vector to the next SIMD width by extending it with
// unspecified trailing lanes (see extend).
template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x) CMT_NOEXCEPT
{
    return extend<Nout>(x);
}
    181 
// Expands an N-element vector to the next SIMD width, filling the trailing lanes
// with the given value (see widen).
template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) CMT_NOEXCEPT
{
    return widen<Nout>(x, value);
}
    187 
// Ternary intrin, base case: N fits within the native vector width (N <= Nvec),
// so fn is applied to the whole vectors directly.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    result = fn(a, b, c);
}
    194 
// Ternary intrin, recursive case: N exceeds the native vector width, so the
// operation is applied to the low and high halves independently.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, c.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, c.h.high, fn);
}
    202 
// Unary intrin, base case: the vector fits within the native SIMD width.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    result = fn(a);
}
    208 
    209 template <typename T, size_t Nvec = vector_width<T>, size_t N, typename Fn, KFR_ENABLE_IF(N > Nvec)>
    210 KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
    211 {
    212     intrin(result.h.low, a.h.low, fn);
    213     intrin(result.h.high, a.h.high, fn);
    214 }
    215 
// Binary intrin (vec, vec), base case: applied directly when N <= native width.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}
    221 
// Binary intrin (vec, vec), recursive case: split both operands into halves.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, fn);
}
    228 
// Binary intrin (vec, scalar), base case.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    result = fn(a, b);
}
    234 
// Binary intrin (vec, scalar), recursive case: the scalar b is passed unchanged
// to both halves.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b, fn);
    intrin(result.h.high, a.h.high, b, fn);
}
    241 
// Binary intrin (scalar, vec), base case.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}
    247 
// Binary intrin (scalar, vec), recursive case: the scalar a is passed unchanged
// to both halves.
template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a, b.h.low, fn);
    intrin(result.h.high, a, b.h.high, fn);
}
    254 
// Generates unary overloads of fn covering non-native vector sizes, gated by cond:
//  - N below the native width (and not a directly supported SIMD size): widen the
//    input to the next SIMD width, call fn, then narrow the result back to N lanes.
//  - N above the native width: delegate to intrin, which recursively splits into halves.
#define KFR_HANDLE_ALL_SIZES_1_IF(fn, cond)                                                                  \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T> && cond)>          \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq<Nout>)).shuffle(csizeseq<N>);                               \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T> && cond),           \
              typename = void>                                                                               \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, [](const auto& x) { return intrinsics::fn(x); });                                       \
        return r;                                                                                            \
    }
    271 
// Unconditional form of KFR_HANDLE_ALL_SIZES_1_IF (cond fixed to true).
#define KFR_HANDLE_ALL_SIZES_1(fn) KFR_HANDLE_ALL_SIZES_1_IF(fn, true)
    273 
// Generates binary overloads of fn covering non-native vector sizes, for three
// argument shapes: (vec, vec), (vec, scalar) and (scalar, vec).
// For each shape:
//  - N below the native width: widen operands to the next SIMD width (scalars are
//    broadcast via vec<T, Nout>(...)), call fn, then narrow the result back to N.
//  - N above the native width: delegate to intrin, which recursively splits.
#define KFR_HANDLE_ALL_SIZES_2(fn)                                                                           \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>)>                  \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), b.shuffle(csizeseq_t<Nout>()))                  \
            .shuffle(csizeseq<N>);                                                                           \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>), typename = void>  \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& aa, const auto& bb) { return intrinsics::fn(aa, bb); });              \
        return r;                                                                                            \
    }                                                                                                        \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>)>                  \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), vec<T, Nout>(b)).shuffle(csizeseq<N>);          \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>), typename = void>  \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& aa, const auto& bb) { return intrinsics::fn(aa, bb); });              \
        return r;                                                                                            \
    }                                                                                                        \
    template <typename T, size_t N,                                                                          \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>)>                  \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
        return intrinsics::fn(vec<T, Nout>(a), b.shuffle(csizeseq_t<Nout>())).shuffle(csizeseq<N>);          \
    }                                                                                                        \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>), typename = void>  \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
    {                                                                                                        \
        vec<T, N> r;                                                                                         \
        intrin(r, a, b, [](const auto& aa, const auto& bb) { return intrinsics::fn(aa, bb); });              \
        return r;                                                                                            \
    }
    318 
// vec1<T> is T itself when T is already a vec, otherwise a 1-element vec<T, 1>.
// Used by the scalar-fallback macros above to lift scalar arguments into vectors.
template <typename T>
using vec1 = std::conditional_t<is_vec<T>, T, vec<T, 1>>;
    321 
// Pass-through overload: non-vec values are returned unchanged (by const reference).
template <typename T>
inline const T& to_scalar(const T& value) CMT_NOEXCEPT
{
    return value;
}
// Unwraps a 1-element vector to its single scalar element (returned by value).
template <typename T>
inline T to_scalar(const vec<T, 1>& value) CMT_NOEXCEPT
{
    return value[0];
}
    332 } // namespace intrinsics
    333 } // namespace CMT_ARCH_NAME
    334 } // namespace kfr
    335 CMT_PRAGMA_GNU(GCC diagnostic pop)