function.hpp
/*
  Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 2 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
*/
#pragma once

#include "../shuffle.hpp"
#include "../types.hpp"
#include "../vec.hpp"

CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

#define KFR_HANDLE_NOT_F_1(fn) \
    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>)> \
    KFR_INTRINSIC vec<flt_type<T>, N> fn(const vec<T, N>& a) CMT_NOEXCEPT \
    { \
        return intrinsics::fn(promoteto<flt_type<T>>(a)); \
    }

#define KFR_HANDLE_SCALAR(fn) \
    template <typename T1, typename... Args, typename Tout = std::common_type_t<T1, Args...>, \
              KFR_ENABLE_IF(!(is_vec<T1> || (is_vec<Args> || ...)))> \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT \
    { \
        using vecout = vec1<Tout>; \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \
    }

#define KFR_HANDLE_SCALAR_1_T(fn, Tout) \
    template <typename T1, typename... Args, typename T = std::common_type_t<T1, Args...>, \
              KFR_ENABLE_IF(!(is_vec<T1> || (is_vec<Args> || ...)))> \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT \
    { \
        using vecout = vec1<Tout>; \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \
    }

#define KFR_HANDLE_ARGS_T(fn, Tout) \
    template <typename T1, typename... Args, typename T = std::common_type_t<T1, Args...>, \
              KFR_ENABLE_IF((is_vec<T1> || (is_vec<Args> || ...)))> \
    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT \
    { \
        using vecout = vec1<Tout>; \
        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \
    }

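// The macros above generate forwarding overloads around an existing
// intrinsics::fn: KFR_HANDLE_NOT_F_1 promotes non-floating-point vectors to
// the matching floating-point type, while the KFR_HANDLE_SCALAR* variants
// accept scalar arguments, wrap them in one-element vectors, and unwrap the
// result. A minimal sketch of what KFR_HANDLE_SCALAR(sqrt) would do for a
// scalar call (the function name is hypothetical, for illustration only):
//
//   // sqrt(2.0) expands to:
//   //   to_scalar(::kfr::intrinsics::sqrt(vec<double, 1>(2.0)))
//   // i.e. the scalar is widened to vec1, the vector overload runs, and
//   // the single lane is extracted again by to_scalar.
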
namespace intrinsics
{
#ifdef CMT_ARCH_X86
using f32sse = vec<f32, 4>;
using f64sse = vec<f64, 2>;
using i8sse  = vec<i8, 16>;
using i16sse = vec<i16, 8>;
using i32sse = vec<i32, 4>;
using i64sse = vec<i64, 2>;
using u8sse  = vec<u8, 16>;
using u16sse = vec<u16, 8>;
using u32sse = vec<u32, 4>;
using u64sse = vec<u64, 2>;

using f32avx = vec<f32, 8>;
using f64avx = vec<f64, 4>;
using i8avx  = vec<i8, 32>;
using i16avx = vec<i16, 16>;
using i32avx = vec<i32, 8>;
using i64avx = vec<i64, 4>;
using u8avx  = vec<u8, 32>;
using u16avx = vec<u16, 16>;
using u32avx = vec<u32, 8>;
using u64avx = vec<u64, 4>;

using f32avx512 = vec<f32, 16>;
using f64avx512 = vec<f64, 8>;
using i8avx512  = vec<i8, 64>;
using i16avx512 = vec<i16, 32>;
using i32avx512 = vec<i32, 16>;
using i64avx512 = vec<i64, 8>;
using u8avx512  = vec<u8, 64>;
using u16avx512 = vec<u16, 32>;
using u32avx512 = vec<u32, 16>;
using u64avx512 = vec<u64, 8>;

using mf32sse = mask<f32, 4>;
using mf64sse = mask<f64, 2>;
using mi8sse  = mask<i8, 16>;
using mi16sse = mask<i16, 8>;
using mi32sse = mask<i32, 4>;
using mi64sse = mask<i64, 2>;
using mu8sse  = mask<u8, 16>;
using mu16sse = mask<u16, 8>;
using mu32sse = mask<u32, 4>;
using mu64sse = mask<u64, 2>;

using mf32avx = mask<f32, 8>;
using mf64avx = mask<f64, 4>;
using mi8avx  = mask<i8, 32>;
using mi16avx = mask<i16, 16>;
using mi32avx = mask<i32, 8>;
using mi64avx = mask<i64, 4>;
using mu8avx  = mask<u8, 32>;
using mu16avx = mask<u16, 16>;
using mu32avx = mask<u32, 8>;
using mu64avx = mask<u64, 4>;

using mf32avx512 = mask<f32, 16>;
using mf64avx512 = mask<f64, 8>;
using mi8avx512  = mask<i8, 64>;
using mi16avx512 = mask<i16, 32>;
using mi32avx512 = mask<i32, 16>;
using mi64avx512 = mask<i64, 8>;
using mu8avx512  = mask<u8, 64>;
using mu16avx512 = mask<u16, 32>;
using mu32avx512 = mask<u32, 16>;
using mu64avx512 = mask<u64, 8>;

#else
using f32neon = vec<f32, 4>;
using f64neon = vec<f64, 2>;
using i8neon  = vec<i8, 16>;
using i16neon = vec<i16, 8>;
using i32neon = vec<i32, 4>;
using i64neon = vec<i64, 2>;
using u8neon  = vec<u8, 16>;
using u16neon = vec<u16, 8>;
using u32neon = vec<u32, 4>;
using u64neon = vec<u64, 2>;

using mf32neon = mask<f32, 4>;
using mf64neon = mask<f64, 2>;
using mi8neon  = mask<i8, 16>;
using mi16neon = mask<i16, 8>;
using mi32neon = mask<i32, 4>;
using mi64neon = mask<i64, 2>;
using mu8neon  = mask<u8, 16>;
using mu16neon = mask<u16, 8>;
using mu32neon = mask<u32, 4>;
using mu64neon = mask<u64, 2>;
#endif

template <typename T>
constexpr inline size_t next_simd_width(size_t n) CMT_NOEXCEPT
{
    return n < minimum_vector_width<T> ? minimum_vector_width<T> : next_poweroftwo(n);
}

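// A worked example of the rounding rule in next_simd_width (the concrete
// numbers are assumptions that depend on the target; suppose
// minimum_vector_width<f32> == 4 and that next_poweroftwo rounds n up to the
// nearest power of two):
//
//   next_simd_width<f32>(3) -> 4   // below the minimum, snap up to it
//   next_simd_width<f32>(5) -> 8   // otherwise round up to a power of two
//   next_simd_width<f32>(8) -> 8   // powers of two are returned unchanged
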
template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, 1>& x) CMT_NOEXCEPT
{
    return broadcast<Nout>(x);
}

template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x) CMT_NOEXCEPT
{
    return extend<Nout>(x);
}

template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) CMT_NOEXCEPT
{
    return widen<Nout>(x, value);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    result = fn(a, b, c);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                          Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, c.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, c.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    result = fn(a);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
{
    intrin(result.h.low, a.h.low, fn);
    intrin(result.h.high, a.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b.h.low, fn);
    intrin(result.h.high, a.h.high, b.h.high, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
{
    intrin(result.h.low, a.h.low, b, fn);
    intrin(result.h.high, a.h.high, b, fn);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    result = fn(a, b);
}

template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
{
    intrin(result.h.low, a, b.h.low, fn);
    intrin(result.h.high, a, b.h.high, fn);
}

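// The intrin overloads above implement a divide-and-conquer dispatch used by
// the KFR_HANDLE_ALL_SIZES_* macros below: vectors no wider than the native
// width are passed to the callable directly, and wider vectors are split into
// low/high halves until the pieces fit. A sketch of the recursion for a
// hypothetical vec<f32, 16> on a target where vector_width<f32> == 4:
//
//   intrin(r, a, fn)               // N = 16 > Nvec = 4, split
//     intrin(r.h.low,  a.h.low,  fn)   // N = 8, splits again into two
//                                      // vec<f32, 4> halves handled by fn
//     intrin(r.h.high, a.h.high, fn)   // same shape for the upper half
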
#define KFR_HANDLE_ALL_SIZES_1_IF(fn, cond) \
    template <typename T, size_t N, \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T> && cond)> \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT \
    { \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N); \
        return intrinsics::fn(a.shuffle(csizeseq<Nout>)).shuffle(csizeseq<N>); \
    } \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T> && cond), \
              typename = void> \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT \
    { \
        vec<T, N> r; \
        intrin(r, a, [](const auto& x) { return intrinsics::fn(x); }); \
        return r; \
    }

#define KFR_HANDLE_ALL_SIZES_1(fn) KFR_HANDLE_ALL_SIZES_1_IF(fn, true)

#define KFR_HANDLE_ALL_SIZES_2(fn) \
    template <typename T, size_t N, \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>)> \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT \
    { \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N); \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), b.shuffle(csizeseq_t<Nout>())) \
            .shuffle(csizeseq<N>); \
    } \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>), typename = void> \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT \
    { \
        vec<T, N> r; \
        intrin(r, a, b, [](const auto& aa, const auto& bb) { return intrinsics::fn(aa, bb); }); \
        return r; \
    } \
    template <typename T, size_t N, \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>)> \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT \
    { \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N); \
        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), vec<T, Nout>(b)).shuffle(csizeseq<N>); \
    } \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>), typename = void> \
    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT \
    { \
        vec<T, N> r; \
        intrin(r, a, b, [](const auto& aa, const auto& bb) { return intrinsics::fn(aa, bb); }); \
        return r; \
    } \
    template <typename T, size_t N, \
              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>)> \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT \
    { \
        constexpr size_t Nout = intrinsics::next_simd_width<T>(N); \
        return intrinsics::fn(vec<T, Nout>(a), b.shuffle(csizeseq_t<Nout>())).shuffle(csizeseq<N>); \
    } \
    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>), typename = void> \
    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT \
    { \
        vec<T, N> r; \
        intrin(r, a, b, [](const auto& aa, const auto& bb) { return intrinsics::fn(aa, bb); }); \
        return r; \
    }

template <typename T>
using vec1 = std::conditional_t<is_vec<T>, T, vec<T, 1>>;

template <typename T>
inline const T& to_scalar(const T& value) CMT_NOEXCEPT
{
    return value;
}
template <typename T>
inline T to_scalar(const vec<T, 1>& value) CMT_NOEXCEPT
{
    return value[0];
}
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
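
// Usage sketch (illustrative, not part of this header): an implementation
// file that provides native overloads of a binary intrinsics::min for
// SIMD-sized vectors could cover the remaining argument shapes with the
// helpers defined above; the function name here is an assumption for the
// sake of the example:
//
//   namespace intrinsics
//   {
//   // ... native vec<T, N> overloads of min(...) defined here ...
//   KFR_HANDLE_ALL_SIZES_2(min) // non-native widths: widen or split halves
//   KFR_HANDLE_SCALAR(min)      // scalar arguments: wrap in vec1, unwrap
//   } // namespace intrinsics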