commit b6320ef16497bcbfe26f0bd107c3f4b9ca3278a3
parent da99a8186349038c9d15c3e3f15a1b7f6b5975d3
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Thu, 21 Feb 2019 01:26:26 +0000
KFR 3.0.5
Diffstat:
325 files changed, 21331 insertions(+), 15841 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -77,7 +77,7 @@ var/
venv/
-# Sphinx documentation
+# Documentation
docs/
mkdocs/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,26 @@
# Changelog
+## 3.0.5
+
+2019-02-21
+
+#### Added
+
+- DFT speeds have been improved by up to 15% on most modern CPUs
+- Support for MSVC 2017
+- Support for GCC 7.3
+- Support for GCC 8.2
+- Support for resampling complex vectors (Thanks to https://github.com/ermito)
+- Tests for various math functions no longer depend on MPFR
+
+#### Changed
+
+- Testo now allocates much less memory during long tests (3x less than previously)
+
+#### Fixed
+
+- Building generators (Thanks to https://github.com/ermito)
+
## 3.0.4
2019-01-08
@@ -9,6 +30,7 @@
#### Changed
- KFR_READCYCLECOUNTER may be redefined to point to any function returning (pseudo-)random value
+- Ability to disable random number initialization functions
#### Fixed
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -15,15 +15,33 @@
# along with KFR.
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.1)
+
+message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS})
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}" CACHE STRING "compile flags" FORCE)
+message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS})
+
project(kfr CXX)
-message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER} ")
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} ${CMAKE_CXX_COMPILER} ")
message(STATUS CMAKE_SYSTEM_PROCESSOR = ${CMAKE_SYSTEM_PROCESSOR})
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
+ set (X86 TRUE)
+else ()
+ set (X86 FALSE)
+endif ()
+
+if (X86)
+ message(STATUS X86)
+endif ()
+
if (MSVC)
message(STATUS MSVC)
endif()
@@ -34,77 +52,128 @@ else()
set(CLANG 0)
endif()
-# Include list of source files
+# Include autogenerated list of source files
include(sources.cmake)
-add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE)
+option(ENABLE_TESTS "Enable tests and examples" OFF)
+if (CLANG)
+ option(ENABLE_DFT "Enable DFT and related algorithms. Requires Clang" ON)
+endif ()
+option(ENABLE_ASMTEST "Enable writing disassembly" OFF)
+option(REGENERATE_TESTS "Regenerate auto tests" OFF)
+option(DISABLE_CLANG_EXTENSIONS "Disable Clang vector extensions" OFF)
+option(KFR_EXTENDED_TESTS "Extended tests (up to hour)" OFF)
+mark_as_advanced(ENABLE_ASMTEST)
+mark_as_advanced(REGENERATE_TESTS)
+mark_as_advanced(DISABLE_CLANG_EXTENSIONS)
+
+if (NOT CPU_ARCH)
+ set(CPU_ARCH avx2)
+endif ()
-option(ENABLE_TESTS "Enable tests and examples. This changes many compiler flags" OFF)
-option(ENABLE_DFT "Enable DFT and related algorithms" ON)
+if (CPU_ARCH STREQUAL "detect")
+ message(STATUS "Detecting native cpu...")
+ try_run(
+ RUN_RESULT COMPILE_RESULT
+ "${CMAKE_BINARY_DIR}/tmpdir"
+ ${CMAKE_SOURCE_DIR}/cmake/detect_cpu.cpp
+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_SOURCE_DIR}/include"
+ COMPILE_OUTPUT_VARIABLE COMPILE_OUT
+ RUN_OUTPUT_VARIABLE RUN_OUT
+ )
+ if (COMPILE_RESULT AND RUN_RESULT EQUAL 0)
+ message(STATUS DETECTED_CPU = ${RUN_OUT})
+ set(CPU_ARCH ${RUN_OUT})
+ else()
+ message(STATUS COMPILE_RESULT = ${COMPILE_RESULT})
+ message(STATUS RUN_RESULT = ${RUN_RESULT})
+ message(STATUS COMPILE_OUT = ${COMPILE_OUT})
+ message(STATUS RUN_OUT = ${RUN_OUT})
+ endif ()
+endif ()
-set(KFR_DFT_SRC
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-src.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/dft_c.h
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp)
+include(cmake/target_set_arch.cmake)
-set(KFR_IO_SRC
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp)
+add_library(use_arch INTERFACE)
+target_set_arch(use_arch INTERFACE ${CPU_ARCH})
-if (ENABLE_TESTS)
+if (WIN32)
+ add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+ add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE)
+endif()
- if (IOS)
- set(STD_LIB)
- else ()
- set(STD_LIB stdc++)
- endif ()
+if (IOS)
+ set(STD_LIB)
+else ()
+ set(STD_LIB stdc++)
+endif ()
+
+# KFR library
+add_library(kfr INTERFACE)
+target_sources(kfr INTERFACE ${KFR_SRC})
+target_include_directories(kfr INTERFACE include)
+target_compile_options(kfr INTERFACE "$<$<CONFIG:DEBUG>:-DKFR_DEBUG>")
+if (NOT MSVC)
+ target_compile_options(kfr INTERFACE -mstackrealign)
+endif ()
+if (MSVC)
+ target_compile_options(kfr INTERFACE -bigobj)
+else ()
+ target_link_libraries(kfr INTERFACE ${STD_LIB} pthread m)
+endif ()
+if (DISABLE_CLANG_EXTENSIONS)
+ target_compile_definitions(kfr INTERFACE -DCMT_DISABLE_CLANG_EXT)
+endif ()
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ target_compile_options(kfr INTERFACE -Wno-ignored-qualifiers)
+endif ()
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ target_compile_options(kfr INTERFACE -Wno-c++1z-extensions)
+endif ()
- # Binary output directories
- set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin)
- set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin)
+if (NOT ENABLE_DFT)
+ target_compile_definitions(kfr INTERFACE -DKFR_NO_DFT)
+endif ()
+if (KFR_EXTENDED_TESTS)
+ target_compile_definitions(kfr INTERFACE -DKFR_EXTENDED_TESTS)
+endif()
- add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+message(STATUS CPU_ARCH=${CPU_ARCH})
- if (NOT MSVC OR CLANG)
- # Enable C++14, disable exceptions and rtti
- if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- add_compile_options(-std=gnu++1y)
- else ()
- add_compile_options(-std=c++1y)
- endif ()
- add_compile_options(-fno-exceptions -fno-rtti )
- if (NOT ARCH_FLAGS)
- add_compile_options(-march=native)
- message(STATUS "Building for native cpu")
- if(WIN32)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mstackrealign -fno-asynchronous-unwind-tables")
- endif()
- else ()
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_FLAGS}")
- endif ()
- if(NOT MSVC)
- link_libraries(${STD_LIB} pthread m)
- endif()
- else ()
- # Disable exceptions
- add_compile_options(/EHsc /D_HAS_EXCEPTIONS=0 /D_CRT_SECURE_NO_WARNINGS=1)
- add_compile_options(/arch:AVX)
+if (ENABLE_TESTS)
+
+ if (MSVC)
+ else()
+ # disable exceptions and rtti
+ add_compile_options(-fno-exceptions -fno-rtti -fno-asynchronous-unwind-tables)
endif ()
add_subdirectory(examples)
add_subdirectory(tests)
+ add_subdirectory(tools)
endif ()
-add_library(kfr INTERFACE)
-target_sources(kfr INTERFACE ${KFR_SRC})
-target_include_directories(kfr INTERFACE include)
-
if (ENABLE_DFT)
+ if (NOT CLANG)
+ message(FATAL_ERROR "Clang compiler is required for DFT in KFR. See README.md for more information")
+ endif()
add_library(kfr_dft ${KFR_DFT_SRC})
- target_link_libraries(kfr_dft kfr)
+ target_link_libraries(kfr_dft kfr use_arch)
+ if (MSVC)
+ target_compile_options(kfr_dft PRIVATE -fp:fast)
+ else()
+ target_compile_options(kfr_dft PRIVATE -ffast-math)
+ endif()
endif()
add_library(kfr_io ${KFR_IO_SRC})
target_link_libraries(kfr_io kfr)
target_compile_definitions(kfr_io PUBLIC KFR_ENABLE_FLAC=1)
+
+install(TARGETS kfr kfr_io ARCHIVE DESTINATION lib)
+
+if (ENABLE_DFT)
+ install(TARGETS kfr_dft ARCHIVE DESTINATION lib)
+endif ()
+
+install(DIRECTORY include/kfr DESTINATION include)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -6,7 +6,7 @@ jobs:
- bash: |
set -e
sudo apt-get update && sudo apt-get install -y ninja-build libmpfr-dev
- ci/run.sh build-release -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+ ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
- job: Linux_x86_64_Clang_Debug
pool:
@@ -15,7 +15,7 @@ jobs:
- bash: |
set -e
sudo apt-get update && sudo apt-get install -y ninja-build libmpfr-dev
- ci/run.sh build-debug -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug
+ ci/run.sh build-debug -DCPU_ARCH=detect -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug
- job: Linux_ARM_Clang_Release
pool:
@@ -46,7 +46,7 @@ jobs:
set -e
/bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer"
brew install ninja
- ci/run.sh build-release -DCMAKE_BUILD_TYPE=Release
+ ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_BUILD_TYPE=Release
- job: macOS_x86_64_Clang_Debug
strategy:
@@ -62,7 +62,7 @@ jobs:
set -e
/bin/bash -c "sudo xcode-select -s /Applications/Xcode_$(XCODE_VER).app/Contents/Developer"
brew install ninja
- ci/run.sh build-release -DCMAKE_BUILD_TYPE=Release
+ ci/run.sh build-release -DCPU_ARCH=detect -DCMAKE_BUILD_TYPE=Release
- job: Windows_MSVC_x86_64_Clang_Release
pool:
@@ -73,7 +73,7 @@ jobs:
call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
set PATH=%PATH:C:\tools\mingw64\bin;=%
set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=%
- ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release
+ ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release
- job: Windows_MSVC_x86_Clang_Release
pool:
@@ -84,7 +84,7 @@ jobs:
call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars32.bat"
set PATH=%PATH:C:\tools\mingw64\bin;=%
set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=%
- ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release
+ ci\run.cmd build-release -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Release
- job: Windows_MSVC_x86_Clang_Debug
pool:
@@ -95,32 +95,34 @@ jobs:
call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise\VC\Auxiliary\Build\vcvars32.bat"
set PATH=%PATH:C:\tools\mingw64\bin;=%
set PATH=%PATH:C:\Program Files\Git\mingw64\bin;=%
- ci\run.cmd build-debug -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DARCH_FLAGS=-mavx -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Debug
+ ci\run.cmd build-debug -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCPU_ARCH=detect -DCMAKE_CXX_FLAGS=-m32 -DCMAKE_BUILD_TYPE=Debug
- job: Windows_MinGW_x86_64_AVX512_Clang_Release
pool: WIN-AVX512
steps:
- script: |
set PATH=C:\msys64\mingw64\bin;C:\msys64\usr\local\bin;C:\msys64\usr\bin;%PATH%
- bash -c "ci/run.sh build-release -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Release"
+ bash -c "ci/run.sh build-release -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Release"
- job: Windows_MinGW_x86_64_AVX512_Clang_Debug
pool: WIN-AVX512
steps:
- script: |
set PATH=C:\msys64\mingw64\bin;C:\msys64\usr\local\bin;C:\msys64\usr\bin;%PATH%
- bash -c "ci/run.sh build-debug -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Debug"
-
+ bash -c "ci/run.sh build-debug -DCMAKE_CXX_COMPILER=/c/LLVM/bin/clang++.exe -DCPU_ARCH=avx512 -DCMAKE_CXX_FLAGS=--target=x86_64-w64-windows-gnu -DCMAKE_BUILD_TYPE=Debug"
+
- job: Windows_MSVC_x86_64_AVX512_Clang_Release
pool: WIN-AVX512
steps:
- script: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
- ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DARCH_FLAGS="-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Release
+ set CXXFLAGS=-m64
+ ci\run.cmd build-release -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DCPU_ARCH=avx512 -DCMAKE_BUILD_TYPE=Release
- job: Windows_MSVC_x86_64_AVX512_Clang_Debug
pool: WIN-AVX512
steps:
- script: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
- ci\run.cmd build-debug -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DARCH_FLAGS="-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl" -DCMAKE_CXX_FLAGS=-m64 -DCMAKE_BUILD_TYPE=Debug
+ set CXXFLAGS=-m64
+ ci\run.cmd build-debug -DARCH_TESTS=ON -DCMAKE_CXX_COMPILER="C:/LLVM/bin/clang-cl.exe" -DCPU_ARCH=avx512 -DCMAKE_BUILD_TYPE=Debug
diff --git a/cmake/arm.cmake b/cmake/arm.cmake
@@ -11,7 +11,9 @@ set (CMAKE_CXX_COMPILER_WORKS TRUE)
set (CMAKE_C_COMPILER_WORKS TRUE)
set (ARM_ROOT "/usr/arm-linux-gnueabihf/include")
-set (GCC_VER 5.4.0)
+if (NOT GCC_VER)
+ set (GCC_VER 5.4.0)
+endif ()
set (SYS_PATHS "-isystem ${ARM_ROOT}/c++/${GCC_VER} -isystem ${ARM_ROOT}/c++/${GCC_VER}/backward -isystem ${ARM_ROOT}/c++/${GCC_VER}/arm-linux-gnueabihf -isystem ${ARM_ROOT}")
set (ARM_COMMON_FLAGS "-target arm-linux-gnueabihf -mcpu=cortex-a15 -mfpu=neon-vfpv4 -mfloat-abi=hard -static")
diff --git a/cmake/detect_cpu.cpp b/cmake/detect_cpu.cpp
@@ -0,0 +1,9 @@
+#include <kfr/runtime/cpuid.hpp>
+
+using namespace kfr;
+
+int main()
+{
+ cpu_t cpu = kfr::internal_generic::detect_cpu();
+ printf("%s", cpu_name(cpu));
+}
+\ No newline at end of file
diff --git a/cmake/target_set_arch.cmake b/cmake/target_set_arch.cmake
@@ -0,0 +1,56 @@
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
+
+ set(ARCH_FLAGS_GNU_generic -DCMT_FORCE_GENERIC_CPU)
+ set(ARCH_FLAGS_GNU_sse2 -msse2)
+ set(ARCH_FLAGS_GNU_sse3 -msse3)
+ set(ARCH_FLAGS_GNU_ssse3 -mssse3)
+ set(ARCH_FLAGS_GNU_sse41 -msse4.1)
+ set(ARCH_FLAGS_GNU_avx -msse4.1 -mavx)
+ set(ARCH_FLAGS_GNU_avx2 -msse4.1 -mavx2 -mfma)
+ set(ARCH_FLAGS_GNU_avx512 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl)
+
+ if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # SSE2 is part of x86_64
+ set(ARCH_FLAG_MS_SSE2)
+ else()
+ set(ARCH_FLAG_MS_SSE2 -arch:SSE2)
+ endif()
+
+ set(ARCH_FLAGS_MS_generic ${ARCH_FLAG_MS_SSE2} -DCMT_FORCE_GENERIC_CPU)
+ set(ARCH_FLAGS_MS_sse2 ${ARCH_FLAG_MS_SSE2})
+ set(ARCH_FLAGS_MS_sse3 ${ARCH_FLAG_MS_SSE2} -D__SSE3__)
+ set(ARCH_FLAGS_MS_ssse3 ${ARCH_FLAG_MS_SSE2} -D__SSSE3__)
+ set(ARCH_FLAGS_MS_sse41 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__)
+ set(ARCH_FLAGS_MS_avx -arch:AVX)
+ set(ARCH_FLAGS_MS_avx2 -arch:AVX2)
+ set(ARCH_FLAGS_MS_avx512 -arch:AVX512)
+
+ function(target_set_arch TARGET MODE ARCH)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+ set(CLANG 1)
+ else ()
+ set(CLANG 0)
+ endif()
+ message(STATUS "target_set_arch(${TARGET} ${MODE} ${ARCH})")
+ if (CLANG OR NOT MSVC)
+ # Reset previous arch flags
+ if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+ target_compile_options(${TARGET} ${MODE} -mno-sse3)
+ else()
+ target_compile_options(${TARGET} ${MODE} -mno-sse)
+ endif()
+ endif ()
+ if (MSVC AND NOT CLANG)
+ target_compile_options(${TARGET} ${MODE} ${ARCH_FLAGS_MS_${ARCH}})
+ else()
+ target_compile_options(${TARGET} ${MODE} ${ARCH_FLAGS_GNU_${ARCH}})
+ endif ()
+ endfunction()
+
+else()
+
+ function(target_set_arch TARGET MODE ARCH)
+ endfunction()
+
+endif ()
diff --git a/cmake/test_toolset/CMakeLists.txt b/cmake/test_toolset/CMakeLists.txt
@@ -1,3 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-
-project(test_toolset CXX)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -15,33 +15,32 @@
# along with KFR.
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.1)
-file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/svg)
+# Binary output directories
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin)
-include_directories(../include)
+file(MAKE_DIRECTORY ${PROJECT_BINARY_DIR}/svg)
add_executable(biquads biquads.cpp)
-target_link_libraries(biquads kfr)
+target_link_libraries(biquads kfr use_arch)
add_executable(window window.cpp)
-target_link_libraries(window kfr)
+target_link_libraries(window kfr use_arch)
add_executable(fir fir.cpp)
+target_link_libraries(fir kfr use_arch)
-target_link_libraries(fir kfr)
if (ENABLE_DFT)
- target_link_libraries(fir kfr_dft)
+ target_link_libraries(fir kfr_dft use_arch)
target_compile_definitions(fir PRIVATE -DHAVE_DFT)
endif ()
add_executable(sample_rate_conversion sample_rate_conversion.cpp)
-target_link_libraries(sample_rate_conversion kfr kfr_io)
-
-add_executable(sample_rate_converter sample_rate_converter.cpp)
-target_link_libraries(sample_rate_converter kfr kfr_io)
+target_link_libraries(sample_rate_conversion kfr kfr_io use_arch)
if (ENABLE_DFT)
add_executable(dft dft.cpp)
- target_link_libraries(dft kfr kfr_dft)
+ target_link_libraries(dft kfr kfr_dft use_arch)
endif ()
diff --git a/examples/biquads.cpp b/examples/biquads.cpp
@@ -94,5 +94,7 @@ int main()
plot_save("biquad_filter_lowpass", output,
options + ", title='Biquad Low pass filter (0.2, 0.9) (using biquad_filter)'");
+ println("SVG plots have been saved to svg directory");
+
return 0;
}
diff --git a/examples/fir.cpp b/examples/fir.cpp
@@ -148,5 +148,7 @@ int main()
#endif
#endif
+ println("SVG plots have been saved to svg directory");
+
return 0;
}
diff --git a/examples/sample_rate_conversion.cpp b/examples/sample_rate_conversion.cpp
@@ -72,5 +72,7 @@ int main()
plot_save("audio_draft_quality", "audio_draft_quality.wav", "");
}
+ println("SVG plots have been saved to svg directory");
+
return 0;
}
diff --git a/examples/window.cpp b/examples/window.cpp
@@ -57,5 +57,7 @@ int main()
output = window_kaiser(output.size(), 2.5);
plot_save("window_kaiser", output, options + ", title='Kaiser window'");
+ println("SVG plots have been saved to svg directory");
+
return 0;
}
diff --git a/include/kfr/all.hpp b/include/kfr/all.hpp
@@ -22,7 +22,6 @@
*/
#include "base.hpp"
-#include "cpuid.hpp"
#include "dft.hpp"
#include "dsp.hpp"
#include "io.hpp"
diff --git a/include/kfr/base.hpp b/include/kfr/base.hpp
@@ -22,44 +22,19 @@
*/
#pragma once
-#include "base/abs.hpp"
-#include "base/asin_acos.hpp"
-#include "base/atan.hpp"
+#include "math.hpp"
+
#include "base/basic_expressions.hpp"
-#include "base/clamp.hpp"
-#include "base/comparison.hpp"
-#include "base/compiletime.hpp"
-#include "base/complex.hpp"
-#include "base/constants.hpp"
#include "base/conversion.hpp"
-#include "base/digitreverse.hpp"
#include "base/expression.hpp"
#include "base/filter.hpp"
-#include "base/function.hpp"
-#include "base/gamma.hpp"
+#include "base/fraction.hpp"
+#include "base/function_expressions.hpp"
#include "base/generators.hpp"
-#include "base/horizontal.hpp"
-#include "base/hyperbolic.hpp"
-#include "base/log_exp.hpp"
-#include "base/logical.hpp"
#include "base/memory.hpp"
-#include "base/min_max.hpp"
-#include "base/modzerobessel.hpp"
-#include "base/operators.hpp"
#include "base/pointer.hpp"
#include "base/random.hpp"
-#include "base/read_write.hpp"
#include "base/reduce.hpp"
-#include "base/round.hpp"
-#include "base/saturation.hpp"
-#include "base/select.hpp"
-#include "base/shuffle.hpp"
-#include "base/sin_cos.hpp"
#include "base/small_buffer.hpp"
#include "base/sort.hpp"
-#include "base/sqrt.hpp"
-#include "base/tan.hpp"
-#include "base/types.hpp"
#include "base/univector.hpp"
-#include "base/vec.hpp"
-#include "version.hpp"
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -1,49 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/abs.hpp"
-
-namespace kfr
-{
-/**
- * @brief Returns the absolute value of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 abs(const T1& x)
-{
- return intrinsics::abs(x);
-}
-
-/**
- * @brief Returns template expression that returns the absolute value of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::abs, E1> abs(E1&& x)
-{
- return { fn::abs(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp
@@ -1,67 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/asin_acos.hpp"
-
-namespace kfr
-{
-
-/**
- * @brief Returns the arc sine of x. The returned angle is in the range \f$-\pi/2\f$ through \f$\pi/2\f$.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> asin(const T1& x)
-{
- return intrinsics::asin(x);
-}
-
-/**
- * @brief Returns template expression that returns the arc sine of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::asin, E1> asin(E1&& x)
-{
- return { fn::asin(), std::forward<E1>(x) };
-}
-/**
- * @brief Returns the arc cosine of x. The returned angle is in the range 0 through \f$\pi\f$.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> acos(const T1& x)
-{
- return intrinsics::acos(x);
-}
-
-/**
- * @brief Returns template expression that returns the arc cosine of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::acos, E1> acos(E1&& x)
-{
- return { fn::acos(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp
@@ -1,107 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/atan.hpp"
-
-namespace kfr
-{
-
-/**
- * @brief Returns the arc tangent of x. The returned angle is in the range \f$-\pi/2\f$ through
- * \f$\pi/2\f$.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> atan(const T1& x)
-{
- return intrinsics::atan(x);
-}
-
-/**
- * @brief Returns template expression that returns the arc tangent of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::atan, E1> atan(E1&& x)
-{
- return { fn::atan(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the arc tangent of the x, expressed in degrees. The returned angle is in the range -90
- * through 90.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> atandeg(const T1& x)
-{
- return intrinsics::atandeg(x);
-}
-
-/**
- * @brief Returns template expression that returns the arc tangent of the x, expressed in degrees.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::atandeg, E1> atandeg(E1&& x)
-{
- return { fn::atandeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
- */
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_FUNC common_type<T1, T2> atan2(const T1& x, const T2& y)
-{
- return intrinsics::atan2(x, y);
-}
-
-/**
- * @brief Returns template expression that returns the arc tangent of y/x.
- */
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_FUNC internal::expression_function<fn::atan2, E1, E2> atan2(E1&& x, E2&& y)
-{
- return { fn::atan2(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/**
- * @brief Returns the arc tangent of y/x (expressed in degrees) using the signs of arguments to determine the
- * correct quadrant.
- */
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_FUNC common_type<T1, T2> atan2deg(const T1& x, const T2& y)
-{
- return intrinsics::atan2deg(x, y);
-}
-
-/**
- * @brief Returns template expression that returns the arc tangent of y/x (expressed in degrees).
- */
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_FUNC internal::expression_function<fn::atan2deg, E1, E2> atan2deg(E1&& x, E2&& y)
-{
- return { fn::atan2deg(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/basic_expressions.hpp b/include/kfr/base/basic_expressions.hpp
@@ -25,27 +25,51 @@
*/
#pragma once
-#include "operators.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/vec.hpp"
#include "univector.hpp"
-#include "vec.hpp"
#include <algorithm>
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace internal
+{
+template <size_t width, typename Fn>
+KFR_INTRINSIC void block_process_impl(size_t& i, size_t size, Fn&& fn)
+{
+ CMT_LOOP_NOUNROLL
+ for (; i < size / width * width; i += width)
+ fn(i, csize_t<width>());
+}
+} // namespace internal
+
+template <size_t... widths, typename Fn>
+KFR_INTRINSIC void block_process(size_t size, csizes_t<widths...>, Fn&& fn)
+{
+ size_t i = 0;
+ swallow{ (internal::block_process_impl<widths>(i, size, std::forward<Fn>(fn)), 0)... };
+}
namespace internal
{
template <typename To, typename E>
-struct expression_convert : expression_base<E>
+struct expression_convert : expression_with_arguments<E>
{
using value_type = To;
- CMT_INLINE expression_convert(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {}
+ KFR_MEM_INTRINSIC expression_convert(E&& expr) CMT_NOEXCEPT
+ : expression_with_arguments<E>(std::forward<E>(expr))
+ {
+ }
template <size_t N>
- CMT_INLINE vec<To, N> operator()(cinput_t input, size_t index, vec_t<To, N>) const
+ friend KFR_INTRINSIC vec<To, N> get_elements(const expression_convert& self, cinput_t input,
+ size_t index, vec_shape<To, N>)
{
- return this->argument_first(input, index, vec_t<To, N>());
+ return self.argument_first(input, index, vec_shape<To, N>());
}
};
@@ -56,7 +80,7 @@ struct expression_iterator
struct iterator
{
T operator*() const { return get(); }
- T get() const { return expr.e1(cinput, position, vec_t<T, 1>())[0]; }
+ T get() const { return get_elements(expr.e1, cinput, position, vec_shape<T, 1>()).front(); }
iterator& operator++()
{
++position;
@@ -79,13 +103,13 @@ struct expression_iterator
} // namespace internal
template <typename To, typename E>
-CMT_INLINE internal::expression_convert<To, E> convert(E&& expr)
+KFR_INTRINSIC internal::expression_convert<To, E> convert(E&& expr)
{
return internal::expression_convert<To, E>(std::forward<E>(expr));
}
template <typename E1, typename T = value_type_of<E1>>
-CMT_INLINE internal::expression_iterator<T, E1> to_iterator(E1&& e1)
+KFR_INTRINSIC internal::expression_iterator<T, E1> to_iterator(E1&& e1)
{
return internal::expression_iterator<T, E1>(std::forward<E1>(e1));
}
@@ -99,30 +123,30 @@ inline auto sequence(const Ts&... list)
}
template <typename T = int>
-CMT_INLINE auto zeros()
+KFR_INTRINSIC auto zeros()
{
return lambda<T>([](cinput_t, size_t, auto x) { return zerovector(x); });
}
template <typename T = int>
-CMT_INLINE auto ones()
+KFR_INTRINSIC auto ones()
{
- return lambda<T>([](cinput_t, size_t, auto x) { return 1; });
+ return lambda<T>([](cinput_t, size_t, auto) { return 1; });
}
template <typename T = int>
-CMT_INLINE auto counter()
+KFR_INTRINSIC auto counter()
{
return lambda<T>([](cinput_t, size_t index, auto x) { return enumerate(x) + index; });
}
template <typename T1>
-CMT_INLINE auto counter(T1 start)
+KFR_INTRINSIC auto counter(T1 start)
{
return lambda<T1>([start](cinput_t, size_t index, auto x) { return enumerate(x) + index + start; });
}
template <typename T1, typename T2>
-CMT_INLINE auto counter(T1 start, T2 step)
+KFR_INTRINSIC auto counter(T1 start, T2 step)
{
return lambda<common_type<T1, T2>>(
[start, step](cinput_t, size_t index, auto x) { return (enumerate(x) + index) * step + start; });
@@ -149,10 +173,10 @@ namespace internal
template <typename T, typename E1>
struct expression_reader
{
- constexpr expression_reader(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {}
+ constexpr expression_reader(E1&& e1) CMT_NOEXCEPT : e1(std::forward<E1>(e1)) {}
T read() const
{
- const T result = e1(cinput, m_position, vec_t<T, 1>());
+ const T result = get_elements(e1, cinput, m_position, vec_shape<T, 1>());
m_position++;
return result;
}
@@ -162,7 +186,7 @@ struct expression_reader
template <typename T, typename E1>
struct expression_writer
{
- constexpr expression_writer(E1&& e1) noexcept : e1(std::forward<E1>(e1)) {}
+ constexpr expression_writer(E1&& e1) CMT_NOEXCEPT : e1(std::forward<E1>(e1)) {}
template <typename U>
void write(U value)
{
@@ -192,19 +216,20 @@ namespace internal
{
template <typename E1>
-struct expression_slice : expression_base<E1>
+struct expression_slice : expression_with_arguments<E1>
{
using value_type = value_type_of<E1>;
using T = value_type;
expression_slice(E1&& e1, size_t start, size_t size)
- : expression_base<E1>(std::forward<E1>(e1)), start(start),
+ : expression_with_arguments<E1>(std::forward<E1>(e1)), start(start),
new_size(size_min(size, size_sub(std::get<0>(this->args).size(), start)))
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_slice& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return this->argument_first(cinput, index + start, y);
+ return self.argument_first(cinput, index + self.start, y);
}
size_t size() const { return new_size; }
size_t start;
@@ -212,15 +237,16 @@ struct expression_slice : expression_base<E1>
};
template <typename E1>
-struct expression_reverse : expression_base<E1>
+struct expression_reverse : expression_with_arguments<E1>
{
using value_type = value_type_of<E1>;
using T = value_type;
- expression_reverse(E1&& e1) : expression_base<E1>(std::forward<E1>(e1)), expr_size(e1.size()) {}
+ expression_reverse(E1&& e1) : expression_with_arguments<E1>(std::forward<E1>(e1)), expr_size(e1.size()) {}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_reverse& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return reverse(this->argument_first(cinput, expr_size - index - N, y));
+ return reverse(self.argument_first(cinput, self.expr_size - index - N, y));
}
size_t size() const { return expr_size; }
size_t expr_size;
@@ -234,7 +260,7 @@ struct expression_linspace<T, false> : input_expression
{
using value_type = T;
- CMT_INLINE constexpr size_t size() const noexcept { return truncate_size; }
+ KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT { return truncate_size; }
expression_linspace(T start, T stop, size_t size, bool endpoint = false, bool truncate = false)
: start(start), offset((stop - start) / T(endpoint ? size - 1 : size)),
@@ -248,10 +274,11 @@ struct expression_linspace<T, false> : input_expression
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_linspace& self, cinput_t, size_t index,
+ vec_shape<T, N> x)
{
using TI = itype<T>;
- return T(start) + (enumerate(x) + cast<T>(cast<TI>(index))) * T(offset);
+ return T(self.start) + (enumerate(x) + static_cast<T>(static_cast<TI>(index))) * T(self.offset);
}
T start;
@@ -264,7 +291,7 @@ struct expression_linspace<T, true> : input_expression
{
using value_type = T;
- CMT_INLINE constexpr size_t size() const noexcept { return truncate_size; }
+ KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT { return truncate_size; }
expression_linspace(T start, T stop, size_t size, bool endpoint = false, bool truncate = false)
: start(start), stop(stop), invsize(1.0 / T(endpoint ? size - 1 : size)),
@@ -278,13 +305,15 @@ struct expression_linspace<T, true> : input_expression
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N> x) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_linspace& self, cinput_t, size_t index,
+ vec_shape<T, N> x)
{
using TI = itype<T>;
- return mix((enumerate(x) + cast<T>(cast<TI>(index))) * invsize, cast<T>(start), cast<T>(stop));
+ return mix((enumerate(x) + static_cast<T>(static_cast<TI>(index))) * self.invsize, self.start,
+ self.stop);
}
template <typename U, size_t N>
- CMT_INLINE static vec<U, N> mix(const vec<U, N>& t, U x, U y)
+ KFR_MEM_INTRINSIC static vec<U, N> mix(const vec<U, N>& t, U x, U y)
{
return (U(1.0) - t) * x + t * y;
}
@@ -296,16 +325,16 @@ struct expression_linspace<T, true> : input_expression
};
template <typename... E>
-struct expression_sequence : expression_base<E...>
+struct expression_sequence : expression_with_arguments<E...>
{
public:
- using base = expression_base<E...>;
+ using base = expression_with_arguments<E...>;
using value_type = common_type<value_type_of<E>...>;
using T = value_type;
template <typename... Expr_>
- CMT_INLINE expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) noexcept
+ KFR_MEM_INTRINSIC expression_sequence(const size_t (&segments)[base::count], Expr_&&... expr) CMT_NOEXCEPT
: base(std::forward<Expr_>(expr)...)
{
std::copy(std::begin(segments), std::end(segments), this->segments.begin() + 1);
@@ -314,20 +343,22 @@ public:
}
template <size_t N>
- CMT_NOINLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_sequence& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- std::size_t sindex = size_t(std::upper_bound(std::begin(segments), std::end(segments), index) - 1 -
- std::begin(segments));
- if (segments[sindex + 1] - index >= N)
- return get(cinput, index, sindex - 1, y);
+ std::size_t sindex =
+ size_t(std::upper_bound(std::begin(self.segments), std::end(self.segments), index) - 1 -
+ std::begin(self.segments));
+ if (self.segments[sindex + 1] - index >= N)
+ return get_elements(self, cinput, index, sindex - 1, y);
else
{
vec<T, N> result;
CMT_PRAGMA_CLANG(clang loop unroll_count(4))
for (size_t i = 0; i < N; i++)
{
- sindex = segments[sindex + 1] == index ? sindex + 1 : sindex;
- result.data()[i] = get(cinput, index, sindex - 1, vec_t<T, 1>())[0];
+ sindex = self.segments[sindex + 1] == index ? sindex + 1 : sindex;
+ result.data()[i] = get_elements(self, cinput, index, sindex - 1, vec_shape<T, 1>()).front();
index++;
}
return result;
@@ -336,10 +367,11 @@ public:
protected:
template <size_t N>
- CMT_NOINLINE vec<T, N> get(cinput_t cinput, size_t index, size_t expr_index, vec_t<T, N> y)
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_sequence& self, cinput_t cinput,
+ size_t index, size_t expr_index, vec_shape<T, N> y)
{
return cswitch(indicesfor_t<E...>(), expr_index,
- [&](auto val) { return this->argument(cinput, val, index, y); },
+ [&](auto val) { return self.argument(cinput, val, index, y); },
[&]() { return zerovector(y); });
}
@@ -347,20 +379,24 @@ protected:
};
template <typename Fn, typename E>
-struct expression_adjacent : expression_base<E>
+struct expression_adjacent : expression_with_arguments<E>
{
using value_type = value_type_of<E>;
using T = value_type;
- expression_adjacent(Fn&& fn, E&& e) : expression_base<E>(std::forward<E>(e)), fn(std::forward<Fn>(fn)) {}
+ expression_adjacent(Fn&& fn, E&& e)
+ : expression_with_arguments<E>(std::forward<E>(e)), fn(std::forward<Fn>(fn))
+ {
+ }
template <size_t N>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_adjacent& self, cinput_t cinput,
+ size_t index, vec_shape<T, N>)
{
- const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>());
- const vec<T, N> delayed = insertleft(data, in);
- data = in[N - 1];
- return this->fn(in, delayed);
+ const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>());
+ const vec<T, N> delayed = insertleft(self.data, in);
+ self.data = in[N - 1];
+ return self.fn(in, delayed);
}
Fn fn;
mutable value_type data = value_type(0);
@@ -370,7 +406,7 @@ struct expression_adjacent : expression_base<E>
/** @brief Returns the subrange of the given expression
*/
template <typename E1>
-CMT_INLINE internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t size = infinite_size)
+KFR_INTRINSIC internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t size = infinite_size)
{
return internal::expression_slice<E1>(std::forward<E1>(e1), start, size);
}
@@ -378,15 +414,15 @@ CMT_INLINE internal::expression_slice<E1> slice(E1&& e1, size_t start, size_t si
/** @brief Returns the expression truncated to the given size
*/
template <typename E1>
-CMT_INLINE internal::expression_slice<E1> truncate(E1&& e1, size_t size)
+KFR_INTRINSIC internal::expression_slice<E1> truncate(E1&& e1, size_t size)
{
return internal::expression_slice<E1>(std::forward<E1>(e1), 0, size);
}
-/** @brief Returns reversed expression
+/** @brief Returns the reversed expression
*/
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_reverse<E1> reverse(E1&& e1)
+KFR_INTRINSIC internal::expression_reverse<E1> reverse(E1&& e1)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return internal::expression_reverse<E1>(std::forward<E1>(e1));
@@ -401,23 +437,24 @@ CMT_INLINE internal::expression_reverse<E1> reverse(E1&& e1)
* @param truncate If ``true``, linspace returns exactly size elements, otherwise, returns infinite sequence
*/
template <typename T1, typename T2, bool precise = false, typename TF = ftype<common_type<T1, T2>>>
-CMT_INLINE internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size,
- bool endpoint = false, bool truncate = false)
+KFR_INTRINSIC internal::expression_linspace<TF, precise> linspace(T1 start, T2 stop, size_t size,
+ bool endpoint = false,
+ bool truncate = false)
{
return internal::expression_linspace<TF, precise>(start, stop, size, endpoint, truncate);
}
KFR_FN(linspace)
template <typename T, bool precise = false, typename TF = ftype<T>>
-CMT_INLINE internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size,
- bool endpoint = false)
+KFR_INTRINSIC internal::expression_linspace<TF, precise> symmlinspace(T symsize, size_t size,
+ bool endpoint = false)
{
return internal::expression_linspace<TF, precise>(symmetric_linspace, symsize, size, endpoint);
}
KFR_FN(symmlinspace)
template <size_t size, typename... E>
-CMT_INLINE internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens)
+KFR_INTRINSIC internal::expression_sequence<decay<E>...> gen_sequence(const size_t (&list)[size], E&&... gens)
{
static_assert(size == sizeof...(E), "Lists must be of equal length");
return internal::expression_sequence<decay<E>...>(list, std::forward<E>(gens)...);
@@ -428,7 +465,7 @@ KFR_FN(gen_sequence)
* @brief Returns template expression that returns the result of calling \f$ fn(x_i, x_{i-1}) \f$
*/
template <typename Fn, typename E1>
-CMT_INLINE internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1)
+KFR_INTRINSIC internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1)
{
return internal::expression_adjacent<Fn, E1>(std::forward<Fn>(fn), std::forward<E1>(e1));
}
@@ -436,37 +473,38 @@ CMT_INLINE internal::expression_adjacent<Fn, E1> adjacent(Fn&& fn, E1&& e1)
namespace internal
{
template <typename E>
-struct expression_padded : expression_base<E>
+struct expression_padded : expression_with_arguments<E>
{
using value_type = value_type_of<E>;
- CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; }
+ KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; }
expression_padded(value_type fill_value, E&& e)
- : expression_base<E>(std::forward<E>(e)), fill_value(fill_value), input_size(e.size())
+ : expression_with_arguments<E>(std::forward<E>(e)), fill_value(fill_value), input_size(e.size())
{
}
template <size_t N>
- vec<value_type, N> operator()(cinput_t cinput, size_t index, vec_t<value_type, N> y) const
+ KFR_INTRINSIC friend vec<value_type, N> get_elements(const expression_padded& self, cinput_t cinput,
+ size_t index, vec_shape<value_type, N> y)
{
- if (index >= input_size)
+ if (index >= self.input_size)
{
- return fill_value;
+ return self.fill_value;
}
- else if (index + N <= input_size)
+ else if (index + N <= self.input_size)
{
- return this->argument_first(cinput, index, y);
+ return self.argument_first(cinput, index, y);
}
else
{
- vec<value_type, N> x;
+ vec<value_type, N> x{};
for (size_t i = 0; i < N; i++)
{
- if (index + i < input_size)
- x[i] = this->argument_first(cinput, index + i, vec_t<value_type, 1>())[0];
+ if (index + i < self.input_size)
+ x[i] = self.argument_first(cinput, index + i, vec_shape<value_type, 1>()).front();
else
- x[i] = fill_value;
+ x[i] = self.fill_value;
}
return x;
}
@@ -507,44 +545,45 @@ private:
};
template <typename... E>
-struct expression_pack : expression_base<E...>
+struct expression_pack : expression_with_arguments<E...>
{
constexpr static size_t count = sizeof...(E);
- expression_pack(E&&... e) : expression_base<E...>(std::forward<E>(e)...) {}
+ expression_pack(E&&... e) : expression_with_arguments<E...>(std::forward<E>(e)...) {}
using value_type = vec<common_type<value_type_of<E>...>, count>;
using T = value_type;
- using expression_base<E...>::size;
+ using expression_with_arguments<E...>::size;
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pack& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return this->call(cinput, fn::packtranspose(), index, y);
+ return self.call(cinput, fn::packtranspose(), index, y);
}
};
template <typename... E>
-struct expression_unpack : private expression_base<E...>, output_expression
+struct expression_unpack : private expression_with_arguments<E...>, output_expression
{
- using expression_base<E...>::begin_block;
- using expression_base<E...>::end_block;
+ using expression_with_arguments<E...>::begin_block;
+ using expression_with_arguments<E...>::end_block;
using output_expression::begin_block;
using output_expression::end_block;
constexpr static size_t count = sizeof...(E);
- expression_unpack(E&&... e) : expression_base<E...>(std::forward<E>(e)...) {}
+ expression_unpack(E&&... e) : expression_with_arguments<E...>(std::forward<E>(e)...) {}
- using expression_base<E...>::size;
+ using expression_with_arguments<E...>::size;
template <typename U, size_t N>
- CMT_INLINE void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x)
+ KFR_MEM_INTRINSIC void operator()(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x)
{
- output(coutput, index, x, csizeseq_t<count>());
+ output(coutput, index, x, csizeseq<count>);
}
template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)>
- CMT_INLINE expression_unpack& operator=(Input&& input)
+ KFR_MEM_INTRINSIC expression_unpack& operator=(Input&& input)
{
process(*this, std::forward<Input>(input));
return *this;
@@ -554,7 +593,7 @@ private:
template <typename U, size_t N, size_t... indices>
void output(coutput_t coutput, size_t index, const vec<vec<U, count>, N>& x, csizes_t<indices...>)
{
- const vec<vec<U, N>, count> xx = compcast<vec<U, N>>(transpose<count>(flatten(x)));
+ const vec<vec<U, N>, count> xx = vec<vec<U, N>, count>::from_flatten(transpose<count>(flatten(x)));
swallow{ (std::get<indices>(this->args)(coutput, index, xx[indices]), void(), 0)... };
}
};
@@ -600,12 +639,13 @@ task_partition<OutExpr, InExpr> partition(OutExpr&& output, InExpr&& input, size
{
static_assert(!is_infinite<OutExpr>::value || !is_infinite<InExpr>::value, "");
- minimum_size = minimum_size == 0 ? platform<T>::vector_width * 8 : minimum_size;
+ minimum_size = minimum_size == 0 ? vector_width<T> * 8 : minimum_size;
const size_t size = size_min(output.size(), input.size());
- const size_t chunk_size = align_up(std::max(size / count, minimum_size), platform<T>::vector_width);
+ const size_t chunk_size = align_up(std::max(size / count, minimum_size), vector_width<T>);
task_partition<OutExpr, InExpr> result(std::forward<OutExpr>(output), std::forward<InExpr>(input), size,
chunk_size, (size + chunk_size - 1) / chunk_size);
return result;
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/bitwise.hpp b/include/kfr/base/bitwise.hpp
@@ -1,136 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "constants.hpp"
-#include "vec.hpp"
-
-namespace kfr
-{
-
-CMT_INLINE float bitwisenot(float x) { return fbitcast(~ubitcast(x)); }
-CMT_INLINE float bitwiseor(float x, float y) { return fbitcast(ubitcast(x) | ubitcast(y)); }
-CMT_INLINE float bitwiseand(float x, float y) { return fbitcast(ubitcast(x) & ubitcast(y)); }
-CMT_INLINE float bitwiseandnot(float x, float y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); }
-CMT_INLINE float bitwisexor(float x, float y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); }
-CMT_INLINE double bitwisenot(double x) { return fbitcast(~ubitcast(x)); }
-CMT_INLINE double bitwiseor(double x, double y) { return fbitcast(ubitcast(x) | ubitcast(y)); }
-CMT_INLINE double bitwiseand(double x, double y) { return fbitcast(ubitcast(x) & ubitcast(y)); }
-CMT_INLINE double bitwiseandnot(double x, double y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); }
-CMT_INLINE double bitwisexor(double x, double y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); }
-
-/// @brief Bitwise Not
-template <typename T1>
-CMT_INLINE T1 bitwisenot(const T1& x)
-{
- return ~x;
-}
-KFR_FN(bitwisenot)
-
-/// @brief Bitwise And
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> bitwiseand(const T1& x, const T2& y)
-{
- return x & y;
-}
-template <typename T>
-constexpr CMT_INLINE T bitwiseand(initialvalue<T>)
-{
- return constants<T>::allones();
-}
-KFR_FN(bitwiseand)
-
-/// @brief Bitwise And-Not
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y)
-{
- return x & ~y;
-}
-template <typename T>
-constexpr inline T bitwiseandnot(initialvalue<T>)
-{
- return constants<T>::allones();
-}
-KFR_FN(bitwiseandnot)
-
-/// @brief Bitwise Or
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> bitwiseor(const T1& x, const T2& y)
-{
- return x | y;
-}
-template <typename T>
-constexpr CMT_INLINE T bitwiseor(initialvalue<T>)
-{
- return subtype<T>(0);
-}
-KFR_FN(bitwiseor)
-
-/// @brief Bitwise Xor (Exclusive Or)
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> bitwisexor(const T1& x, const T2& y)
-{
- return x ^ y;
-}
-template <typename T>
-constexpr CMT_INLINE T bitwisexor(initialvalue<T>)
-{
- return subtype<T>();
-}
-KFR_FN(bitwisexor)
-
-/// @brief Bitwise Left shift
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> shl(const T1& left, const T2& right)
-{
- return left << right;
-}
-KFR_FN(shl)
-
-/// @brief Bitwise Right shift
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> shr(const T1& left, const T2& right)
-{
- return left >> right;
-}
-KFR_FN(shr)
-
-/// @brief Bitwise Left Rotate
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> rol(const T1& left, const T2& right)
-{
- return shl(left, right) | shr(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right));
-}
-KFR_FN(rol)
-
-/// @brief Bitwise Right Rotate
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> ror(const T1& left, const T2& right)
-{
- return shr(left, right) | shl(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right));
-}
-KFR_FN(ror)
-} // namespace kfr
diff --git a/include/kfr/base/clamp.hpp b/include/kfr/base/clamp.hpp
@@ -1,62 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/clamp.hpp"
-
-namespace kfr
-{
-
-/// @brief Returns the first argument clamped to a range [lo, hi]
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value),
- typename Tout = common_type<T1, T2, T3>>
-KFR_INTRIN Tout clamp(const T1& x, const T2& lo, const T3& hi)
-{
- return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi));
-}
-
-/// @brief Creates an expression that returns the first argument clamped to a range [lo, hi]
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INTRIN internal::expression_function<fn::clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi)
-{
- return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) };
-}
-
-/// @brief Returns the first argument clamped to a range [0, hi]
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout clamp(const T1& x, const T2& hi)
-{
- return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(hi));
-}
-
-/// @brief Creates an expression that returns the first argument clamped to a range [0, hi]
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::clamp, E1, E2> clamp(E1&& x, E2&& hi)
-{
- return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(hi) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/comparison.hpp b/include/kfr/base/comparison.hpp
@@ -1,149 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "constants.hpp"
-#include "expression.hpp"
-#include "vec.hpp"
-
-namespace kfr
-{
-
-template <typename T1, typename T2>
-inline maskfor<common_type<T1, T2>> equal(const T1& x, const T2& y)
-{
- return x == y;
-}
-template <typename T1, typename T2>
-inline maskfor<common_type<T1, T2>> notequal(const T1& x, const T2& y)
-{
- return x != y;
-}
-template <typename T1, typename T2>
-inline maskfor<common_type<T1, T2>> less(const T1& x, const T2& y)
-{
- return x < y;
-}
-template <typename T1, typename T2>
-inline maskfor<common_type<T1, T2>> greater(const T1& x, const T2& y)
-{
- return x > y;
-}
-template <typename T1, typename T2>
-inline maskfor<common_type<T1, T2>> lessorequal(const T1& x, const T2& y)
-{
- return x <= y;
-}
-template <typename T1, typename T2>
-inline maskfor<common_type<T1, T2>> greaterorequal(const T1& x, const T2& y)
-{
- return x >= y;
-}
-KFR_FN(equal)
-KFR_FN(notequal)
-KFR_FN(less)
-KFR_FN(greater)
-KFR_FN(lessorequal)
-KFR_FN(greaterorequal)
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::equal, E1, E2> operator==(E1&& e1, E2&& e2)
-{
- return { fn::equal(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::notequal, E1, E2> operator!=(E1&& e1, E2&& e2)
-{
- return { fn::notequal(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::less, E1, E2> operator<(E1&& e1, E2&& e2)
-{
- return { fn::less(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::greater, E1, E2> operator>(E1&& e1, E2&& e2)
-{
- return { fn::greater(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::lessorequal, E1, E2> operator<=(E1&& e1, E2&& e2)
-{
- return { fn::lessorequal(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::greaterorequal, E1, E2> operator>=(E1&& e1, E2&& e2)
-{
- return { fn::greaterorequal(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename T, size_t N>
-CMT_INLINE mask<T, N> isnan(const vec<T, N>& x)
-{
- return x != x;
-}
-
-template <typename T, size_t N>
-CMT_INLINE mask<T, N> isinf(const vec<T, N>& x)
-{
- return x == constants<T>::infinity || x == -constants<T>::infinity;
-}
-
-template <typename T, size_t N>
-CMT_INLINE mask<T, N> isfinite(const vec<T, N>& x)
-{
- return !isnan(x) && !isinf(x);
-}
-
-template <typename T, size_t N>
-CMT_INLINE mask<T, N> isnegative(const vec<T, N>& x)
-{
- return (x & constants<T>::highbitmask()) != 0;
-}
-
-template <typename T, size_t N>
-CMT_INLINE mask<T, N> ispositive(const vec<T, N>& x)
-{
- return !isnegative(x);
-}
-
-template <typename T, size_t N>
-CMT_INLINE mask<T, N> iszero(const vec<T, N>& x)
-{
- return x == T();
-}
-
-template <typename T1, typename T2, typename T3>
-KFR_SINTRIN maskfor<common_type<T1, T2, T3>> inrange(const T1& x, const T2& min, const T3& max)
-{
- return x >= min && x <= max;
-}
-} // namespace kfr
diff --git a/include/kfr/base/compiletime.hpp b/include/kfr/base/compiletime.hpp
@@ -1,84 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "constants.hpp"
-#include "operators.hpp"
-#include "types.hpp"
-
-namespace kfr
-{
-
-namespace compiletime
-{
-
-template <typename T>
-constexpr inline T select(bool c, T x, T y)
-{
- return c ? x : y;
-}
-template <typename T>
-constexpr inline T trunc(T x)
-{
- return static_cast<T>(static_cast<long long>(x));
-}
-template <typename T>
-constexpr inline T abs(T x)
-{
- return x < T() ? -x : x;
-}
-template <typename T>
-constexpr inline T mulsign(T x, T y)
-{
- return y < T() ? -x : x;
-}
-template <typename T>
-constexpr inline T sin(T x)
-{
- x = x - trunc(x / c_pi<T, 2>) * c_pi<T, 2>;
- constexpr T c2 = -0.16665853559970855712890625;
- constexpr T c4 = +8.31427983939647674560546875e-3;
- constexpr T c6 = -1.85423981747590005397796630859375e-4;
-
- x -= c_pi<T>;
- T y = abs(x);
- y = select(y > c_pi<T, 1, 2>, c_pi<T> - y, y);
- y = mulsign(y, -x);
-
- const T y2 = y * y;
- T formula = c6;
- const T y3 = y2 * y;
- formula = fmadd(formula, y2, c4);
- formula = fmadd(formula, y2, c2);
- formula = formula * y3 + y;
- return formula;
-}
-template <typename T>
-constexpr inline T cos(T x)
-{
- return sin(x + c_pi<T, 1, 2>);
-}
-} // namespace compiletime
-} // namespace kfr
diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp
@@ -1,967 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "abs.hpp"
-#include "atan.hpp"
-#include "constants.hpp"
-#include "function.hpp"
-#include "hyperbolic.hpp"
-#include "log_exp.hpp"
-#include "min_max.hpp"
-#include "operators.hpp"
-#include "select.hpp"
-#include "sin_cos.hpp"
-#include "sqrt.hpp"
-
-#ifdef KFR_STD_COMPLEX
-#include <complex>
-#endif
-
-CMT_PRAGMA_MSVC(warning(push))
-CMT_PRAGMA_MSVC(warning(disable : 4814))
-
-namespace kfr
-{
-#ifdef KFR_STD_COMPLEX
-
-template <typename T>
-using complex = std::complex<T>;
-
-#else
-#ifndef KFR_CUSTOM_COMPLEX
-
-/**
- * @brief Represents the complex numbers. If KFR_STD_COMPLEX is defined, then kfr::complex is an alias for
- * std::complex.
- */
-template <typename T>
-struct complex
-{
- static_assert(is_simd_type<T>::value, "Incorrect type for complex");
- constexpr static bool is_pod = true;
- constexpr complex() noexcept = default;
- constexpr complex(T re) noexcept : re(re), im(0) {}
- constexpr complex(T re, T im) noexcept : re(re), im(im) {}
- constexpr complex(const complex&) noexcept = default;
- constexpr complex(complex&&) noexcept = default;
- template <typename U>
- constexpr complex(const complex<U>& other) noexcept
- : re(static_cast<T>(other.re)), im(static_cast<T>(other.im))
- {
- }
- template <typename U>
- constexpr complex(complex<U>&& other) noexcept : re(std::move(other.re)), im(std::move(other.im))
- {
- }
-#ifdef CMT_COMPILER_GNU
- constexpr complex& operator=(const complex&) noexcept = default;
- constexpr complex& operator=(complex&&) noexcept = default;
-#else
- complex& operator=(const complex&) = default;
- complex& operator=(complex&&) = default;
-#endif
- constexpr const T& real() const noexcept { return re; }
- constexpr const T& imag() const noexcept { return im; }
- constexpr void real(T value) noexcept { re = value; }
- constexpr void imag(T value) noexcept { im = value; }
- T re;
- T im;
-
- KFR_INTRIN friend complex operator+(const complex& x, const complex& y)
- {
- return (make_vector(x) + make_vector(y))[0];
- }
- KFR_INTRIN friend complex operator-(const complex& x, const complex& y)
- {
- return (make_vector(x) - make_vector(y))[0];
- }
- KFR_INTRIN friend complex operator*(const complex& x, const complex& y)
- {
- return (make_vector(x) * make_vector(y))[0];
- }
- KFR_INTRIN friend complex operator/(const complex& x, const complex& y)
- {
- return (make_vector(x) / make_vector(y))[0];
- }
-
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator+(const complex& x, const U& y)
- {
- return static_cast<C>(x) + static_cast<C>(y);
- }
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator-(const complex& x, const U& y)
- {
- return static_cast<C>(x) - static_cast<C>(y);
- }
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator*(const complex& x, const U& y)
- {
- return static_cast<C>(x) * static_cast<C>(y);
- }
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator/(const complex& x, const U& y)
- {
- return static_cast<C>(x) / static_cast<C>(y);
- }
-
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator+(const U& x, const complex& y)
- {
- return static_cast<C>(x) + static_cast<C>(y);
- }
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator-(const U& x, const complex& y)
- {
- return static_cast<C>(x) - static_cast<C>(y);
- }
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator*(const U& x, const complex& y)
- {
- return static_cast<C>(x) * static_cast<C>(y);
- }
- template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
- KFR_INTRIN friend C operator/(const U& x, const complex& y)
- {
- return static_cast<C>(x) / static_cast<C>(y);
- }
- KFR_INTRIN friend complex operator-(const complex& x) { return (-make_vector(x))[0]; }
-};
-#endif
-#endif
-} // namespace kfr
-namespace cometa
-{
-template <typename T>
-struct compound_type_traits<kfr::complex<T>>
-{
- constexpr static size_t width = 2;
- constexpr static size_t deep_width = width * compound_type_traits<T>::width;
- using subtype = T;
- using deep_subtype = cometa::deep_subtype<T>;
- constexpr static bool is_scalar = false;
- constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
- template <typename U>
- using rebind = kfr::complex<U>;
- template <typename U>
- using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>;
-
- static constexpr subtype at(const kfr::complex<T>& value, size_t index)
- {
- return index == 0 ? value.real() : value.imag();
- }
-};
-} // namespace cometa
-namespace kfr
-{
-
-/// @brief Alias for complex<f32>
-using c32 = complex<f32>;
-
-/// @brief Alias for complex<f64>
-using c64 = complex<f64>;
-
-/// @brief Alias for complex<fbase>
-using cbase = complex<fbase>;
-
-namespace internal
-{
-template <typename T>
-constexpr inline vec<T, 2> vcomplex(const complex<T>& v)
-{
- return vec<T, 2>(v.real(), v.imag());
-}
-} // namespace internal
-
-/// @brief vec<> specialization for complex numbers. Implements all operators
-template <typename T, size_t N>
-struct vec<complex<T>, N> : private vec<T, 2 * N>
-{
- using base = vec<T, 2 * N>;
-
- using value_type = complex<T>;
- constexpr static size_t size() noexcept { return N; }
-
- using scalar_type = T;
- constexpr static size_t scalar_size() noexcept { return 2 * N; }
-
- using simd_type = typename base::simd_type;
-
- constexpr vec() noexcept = default;
- constexpr vec(const vec&) noexcept = default;
- CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default;
- template <int = 0>
- constexpr vec(const simd_type& simd) noexcept : base(simd)
- {
- }
- KFR_I_CE vec(czeros_t) noexcept : base(czeros) {}
- KFR_I_CE vec(cones_t) noexcept : base(cones) {}
- KFR_I_CE vec(const value_type& s) noexcept : base(repeat<N>(vec<T, 2>(s.real(), s.imag()))) {}
-
- template <typename U>
- KFR_I_CE vec(const complex<U>& s) noexcept
- : base(repeat<N>(vec<T, 2>(static_cast<T>(s.real()), static_cast<T>(s.imag()))))
- {
- }
- template <typename U>
- KFR_I_CE vec(const vec<complex<U>, N>& v) noexcept : base(static_cast<vec<T, N * 2>>(v.flatten()))
- {
- }
-
- explicit KFR_I_CE vec(const vec<T, N * 2>& v) noexcept : base(v) {}
-
- // from real
- KFR_I_CE vec(const T& r) noexcept : base(interleave(vec<T, N>(r), vec<T, N>(czeros))) {}
- // from real
- template <typename U, typename = enable_if<std::is_convertible<U, T>::value>>
- KFR_I_CE vec(const vec<U, N>& r) noexcept : base(interleave(vec<T, N>(r), vec<T, N>(czeros)))
- {
- }
-
- // from list of vectors
- template <typename... Us>
- KFR_I_CE vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept
- : base(internal::vcomplex(s0), internal::vcomplex(s1),
- internal::vcomplex(static_cast<value_type>(rest))...)
- {
- }
-
- template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(value_type) * N)>
- KFR_I_CE static vec frombits(const vec<U, M>& v) noexcept
- {
- return vec(vec<T, scalar_size()>::frombits(v.flatten()));
- }
-
-#define KFR_B(x) static_cast<const base&>(x)
- // math / bitwise / comparison operators
- KFR_I_CE friend vec operator+(const vec& x) noexcept { return x; }
- KFR_I_CE friend vec operator-(const vec& x) noexcept { return vec(-KFR_B(x)); }
- KFR_I_CE friend vec operator~(const vec& x) noexcept { return vec(~KFR_B(x)); }
-
- KFR_I_CE friend vec operator+(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) + KFR_B(y)); }
- KFR_I_CE friend vec operator-(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) - KFR_B(y)); }
- CMT_GNU_CONSTEXPR friend vec operator*(const vec& x, const vec& y) noexcept
- {
- const vec<scalar_type, N* 2> xx = x;
- const vec<scalar_type, N* 2> yy = y;
- return vec(subadd(xx * dupeven(yy), swap<2>(xx) * dupodd(yy)));
- }
- CMT_GNU_CONSTEXPR friend vec operator/(const vec& x, const vec& y) noexcept
- {
- const vec<scalar_type, N* 2> xx = x;
- const vec<scalar_type, N* 2> yy = y;
- const vec<scalar_type, N* 2> m = (sqr(dupeven(yy)) + sqr(dupodd(yy)));
- return vec(swap<2>(subadd(swap<2>(xx) * dupeven(yy), xx * dupodd(yy)) / m));
- }
-
- KFR_I_CE friend vec operator&(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) & KFR_B(y)); }
- KFR_I_CE friend vec operator|(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) | KFR_B(y)); }
- KFR_I_CE friend vec operator^(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) ^ KFR_B(y)); }
-
- KFR_I_CE friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; }
- KFR_I_CE friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; }
- KFR_I_CE friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; }
- KFR_I_CE friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; }
-
- KFR_I_CE friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; }
- KFR_I_CE friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; }
- KFR_I_CE friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; }
-
- KFR_I_CE friend vec& operator++(vec& x) noexcept { return x = x + vec(1); }
- KFR_I_CE friend vec& operator--(vec& x) noexcept { return x = x - vec(1); }
- KFR_I_CE friend vec operator++(vec& x, int) noexcept
- {
- const vec z = x;
- ++x;
- return z;
- }
- KFR_I_CE friend vec operator--(vec& x, int) noexcept
- {
- const vec z = x;
- --x;
- return z;
- }
-
- // shuffle
- template <size_t... indices>
- KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept
- {
- return *base::shuffle(scale<2, indices...>());
- }
- template <size_t... indices>
- KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept
- {
- return *base::shuffle(y, scale<2, indices...>());
- }
-
- // element access
- struct element;
- KFR_I_CE value_type operator[](size_t index) const noexcept { return get(index); }
- KFR_I_CE element operator[](size_t index) noexcept { return { *this, index }; }
-
- KFR_I_CE value_type get(size_t index) const noexcept
- {
- return reinterpret_cast<const value_type(&)[N]>(*this)[index];
- }
- KFR_I_CE void set(size_t index, const value_type& s) noexcept
- {
- reinterpret_cast<value_type(&)[N]>(*this)[index] = s;
- }
- template <size_t index>
- KFR_I_CE value_type get(csize_t<index>) const noexcept
- {
- return static_cast<const base&>(*this).shuffle(csizeseq_t<2, index * 2>());
- }
- template <size_t index>
- KFR_I_CE void set(csize_t<index>, const value_type& s) noexcept
- {
- *this = vec(static_cast<const base&>(*this))
- .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * 2>() &&
- csizeseq_t<N>() < csize_t<(index + 1) * 2>()) *
- N);
- }
- struct element
- {
- KFR_I_CE operator value_type() const noexcept { return v.get(index); }
- element& operator=(const value_type& s) noexcept
- {
- v.set(index, s);
- return *this;
- }
-
- element& operator=(const element& s) noexcept
- {
- v.set(index, static_cast<value_type>(s));
- return *this;
- }
- template <typename U, size_t M>
- element& operator=(const typename vec<U, M>::element& s) noexcept
- {
- v.set(index, static_cast<value_type>(static_cast<U>(s)));
- return *this;
- }
-
- vec& v;
- size_t index;
- };
-
- template <bool aligned = false>
- explicit KFR_I_CE vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept
- : base(ptr_cast<T>(src), cbool_t<aligned>())
- {
- }
- template <bool aligned = false>
- const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept
- {
- base::write(ptr_cast<T>(dest), cbool_t<aligned>());
- return *this;
- }
-
- const base& flatten() const noexcept { return *this; }
- simd_type operator*() const noexcept { return base::operator*(); }
- simd_type& operator*() noexcept { return base::operator*(); }
-};
-
-/// @brief Returns vector of complex values with real part duplicated
-template <typename T, size_t N>
-CMT_INLINE vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x)
-{
- return compcast<complex<T>>(dupeven(compcast<T>(x)));
-}
-KFR_FN(cdupreal)
-
-/// @brief Returns vector of complex values with imaginary part duplicated
-template <typename T, size_t N>
-CMT_INLINE vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x)
-{
- return compcast<complex<T>>(dupodd(compcast<T>(x)));
-}
-KFR_FN(cdupimag)
-
-/// @brief Returns vector of complex values with real and imaginary parts swapped
-template <typename T, size_t N>
-CMT_INLINE vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x)
-{
- return compcast<complex<T>>(swap<2>(compcast<T>(x)));
-}
-KFR_FN(cswapreim)
-
-/// @brief Returns vector of complex values with real part negated
-template <typename T, size_t N>
-CMT_INLINE vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x)
-{
- return x ^ complex<T>(-T(), T());
-}
-KFR_FN(cnegreal)
-
-/// @brief Returns vector of complex values with imaginary part negated
-template <typename T, size_t N>
-CMT_INLINE vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x)
-{
- return x ^ complex<T>(T(), -T());
-}
-KFR_FN(cnegimag)
-
-namespace internal
-{
-template <typename T>
-struct is_complex_impl : std::false_type
-{
-};
-template <typename T>
-struct is_complex_impl<complex<T>> : std::true_type
-{
-};
-
-// vector<complex> to vector<complex>
-template <typename To, typename From, size_t N>
-struct conversion<vec<complex<To>, N>, vec<complex<From>, N>>
-{
- static_assert(!is_compound<To>::value, "");
- static_assert(!is_compound<From>::value, "");
- static vec<complex<To>, N> cast(const vec<complex<From>, N>& value)
- {
- return builtin_convertvector<complex<To>>(value);
- }
-};
-
-// vector to vector<complex>
-template <typename To, typename From, size_t N>
-struct conversion<vec<complex<To>, N>, vec<From, N>>
-{
- static_assert(!is_compound<To>::value, "");
- static_assert(!is_compound<From>::value, "");
- static vec<complex<To>, N> cast(const vec<From, N>& value)
- {
- const vec<To, N> casted = static_cast<vec<To, N>>(value);
- return *interleave(casted, zerovector(casted));
- }
-};
-
-} // namespace internal
-
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<complex<T>, N / 2> ccomp(const vec<T, N>& x)
-{
- return compcast<complex<T>>(x);
-}
-
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N * 2> cdecom(const vec<complex<T>, N>& x)
-{
- return compcast<T>(x);
-}
-
-/// @brief Returns the real part of the complex value
-template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)>
-constexpr CMT_INLINE T real(const T& value)
-{
- return value;
-}
-
-/// @brief Returns the real part of the complex value
-template <typename T>
-constexpr CMT_INLINE T real(const complex<T>& value)
-{
- return value.real();
-}
-
-/// @brief Returns the real part of the complex value
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> real(const vec<complex<T>, N>& value)
-{
- return even(compcast<T>(value));
-}
-
-template <typename T>
-using realtype = decltype(kfr::real(std::declval<T>()));
-template <typename T>
-using realftype = ftype<decltype(kfr::real(std::declval<T>()))>;
-
-KFR_FN(real)
-
-/// @brief Returns the real part of the complex value
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::real, E1> real(E1&& x)
-{
- return { {}, std::forward<E1>(x) };
-}
-
-/// @brief Returns the imaginary part of the complex value
-template <typename T>
-constexpr CMT_INLINE T imag(const complex<T>& value)
-{
- return value.imag();
-}
-
-/// @brief Returns the imaginary part of the complex value
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> imag(const vec<complex<T>, N>& value)
-{
- return odd(compcast<T>(value));
-}
-KFR_FN(imag)
-
-/// @brief Returns the imaginary part of the complex value
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::imag, E1> imag(E1&& x)
-{
- return { {}, std::forward<E1>(x) };
-}
-
-/// @brief Constructs complex value from real and imaginary parts
-template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>>
-constexpr CMT_INLINE vec<complex<T>, N> make_complex(const vec<T1, N>& real, const vec<T2, N>& imag = T2(0))
-{
- return compcast<complex<T>>(interleave(cast<T>(real), cast<T>(imag)));
-}
-
-/// @brief Constructs complex value from real and imaginary parts
-template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>>
-constexpr CMT_INLINE complex<T> make_complex(T1 real, T2 imag = T2(0))
-{
- return complex<T>(cast<T>(real), cast<T>(imag));
-}
-
-namespace intrinsics
-{
-template <typename T, size_t N>
-CMT_INLINE vec<complex<T>, N> cconj(const vec<complex<T>, N>& x)
-{
- return cnegimag(x);
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> csin(const vec<complex<T>, N>& x)
-{
- return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> csinh(const vec<complex<T>, N>& x)
-{
- return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> ccos(const vec<complex<T>, N>& x)
-{
- return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x)
-{
- return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cabs(const vec<complex<T>, N>& x)
-{
- const vec<T, N* 2> xx = sqr(cdecom(x));
- return sqrt(even(xx) + odd(xx));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> carg(const vec<complex<T>, N>& x)
-{
- const vec<T, N* 2> xx = cdecom(x);
- return atan2(even(xx), odd(xx));
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> clog(const vec<complex<T>, N>& x)
-{
- return make_complex(log(cabs(x)), carg(x));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> clog2(const vec<complex<T>, N>& x)
-{
- return clog(x) * c_recip_log_2<T>;
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> clog10(const vec<complex<T>, N>& x)
-{
- return clog(x) * c_recip_log_10<T>;
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> cexp(const vec<complex<T>, N>& x)
-{
- return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x)
-{
- return cexp(x * c_log_2<T>);
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x)
-{
- return cexp(x * c_log_10<T>);
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> polar(const vec<complex<T>, N>& x)
-{
- return make_complex(cabs(x), carg(x));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x)
-{
- return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x))));
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cabsdup(const vec<T, N>& x)
-{
- x = sqr(x);
- return sqrt(x + swap<2>(x));
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x)
-{
- const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5);
- return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t));
-}
-
-KFR_I_CONVERTER(cconj)
-KFR_I_CONVERTER(csin)
-KFR_I_CONVERTER(csinh)
-KFR_I_CONVERTER(ccos)
-KFR_I_CONVERTER(ccosh)
-KFR_I_CONVERTER(clog)
-KFR_I_CONVERTER(clog2)
-KFR_I_CONVERTER(clog10)
-KFR_I_CONVERTER(cexp)
-KFR_I_CONVERTER(cexp2)
-KFR_I_CONVERTER(cexp10)
-KFR_I_CONVERTER(polar)
-KFR_I_CONVERTER(cartesian)
-KFR_I_CONVERTER(csqrt)
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cabs(const vec<T, N>& a)
-{
- return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a)));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> carg(const vec<T, N>& a)
-{
- return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a)));
-}
-template <typename T1>
-KFR_SINTRIN realtype<T1> cabs(const T1& a)
-{
- using vecout = vec1<T1>;
- return to_scalar(intrinsics::cabs(vecout(a)));
-}
-template <typename T1>
-KFR_SINTRIN realtype<T1> carg(const T1& a)
-{
- using vecout = vec1<T1>;
- return to_scalar(intrinsics::carg(vecout(a)));
-}
-} // namespace intrinsics
-
-KFR_I_FN(cconj)
-KFR_I_FN(csin)
-KFR_I_FN(csinh)
-KFR_I_FN(ccos)
-KFR_I_FN(ccosh)
-KFR_I_FN(cabs)
-KFR_I_FN(carg)
-KFR_I_FN(clog)
-KFR_I_FN(clog2)
-KFR_I_FN(clog10)
-KFR_I_FN(cexp)
-KFR_I_FN(cexp2)
-KFR_I_FN(cexp10)
-KFR_I_FN(polar)
-KFR_I_FN(cartesian)
-KFR_I_FN(csqrt)
-
-/// @brief Returns the sine of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 csin(const T1& x)
-{
- return intrinsics::csin(x);
-}
-
-/// @brief Returns template expression that returns the sine of the the complex value x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::csin, E1> csin(E1&& x)
-{
- return { fn::csin(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic sine of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 csinh(const T1& x)
-{
- return intrinsics::csinh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic sine of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::csinh, E1> csinh(E1&& x)
-{
- return { fn::csinh(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the cosine of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 ccos(const T1& x)
-{
- return intrinsics::ccos(x);
-}
-
-/// @brief Returns template expression that returns the cosine of the the complex value x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::ccos, E1> ccos(E1&& x)
-{
- return { fn::ccos(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic cosine of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 ccosh(const T1& x)
-{
- return intrinsics::ccosh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic cosine of the the complex value x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::ccosh, E1> ccosh(E1&& x)
-{
- return { fn::ccosh(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the absolute value (magnitude) of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC realtype<T1> cabs(const T1& x)
-{
- return intrinsics::cabs(x);
-}
-
-/// @brief Returns template expression that returns the absolute value (magnitude) of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cabs, E1> cabs(E1&& x)
-{
- return { fn::cabs(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the phase angle (argument) of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC realtype<T1> carg(const T1& x)
-{
- return intrinsics::carg(x);
-}
-
-/// @brief Returns template expression that returns the phase angle (argument) of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::carg, E1> carg(E1&& x)
-{
- return { fn::carg(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the complex conjugate of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 cconj(const T1& x)
-{
- return intrinsics::cconj(x);
-}
-
-/// @brief Returns template expression that returns the complex conjugate of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cconj, E1> cconj(E1&& x)
-{
- return { fn::cconj(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the natural logarithm of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 clog(const T1& x)
-{
- return intrinsics::clog(x);
-}
-
-/// @brief Returns template expression that returns the natural logarithm of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::clog, E1> clog(E1&& x)
-{
- return { fn::clog(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the binary (base-2) logarithm of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 clog2(const T1& x)
-{
- return intrinsics::clog2(x);
-}
-
-/// @brief Returns template expression that returns the binary (base-2) logarithm of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::clog2, E1> clog2(E1&& x)
-{
- return { fn::clog2(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the common (base-10) logarithm of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 clog10(const T1& x)
-{
- return intrinsics::clog10(x);
-}
-
-/// @brief Returns template expression that returns the common (base-10) logarithm of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::clog10, E1> clog10(E1&& x)
-{
- return { fn::clog10(), std::forward<E1>(x) };
-}
-
-/// @brief Returns \f$e\f$ raised to the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 cexp(const T1& x)
-{
- return intrinsics::cexp(x);
-}
-
-/// @brief Returns template expression that returns \f$e\f$ raised to the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cexp, E1> cexp(E1&& x)
-{
- return { fn::cexp(), std::forward<E1>(x) };
-}
-
-/// @brief Returns 2 raised to the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 cexp2(const T1& x)
-{
- return intrinsics::cexp2(x);
-}
-
-/// @brief Returns template expression that returns 2 raised to the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cexp2, E1> cexp2(E1&& x)
-{
- return { fn::cexp2(), std::forward<E1>(x) };
-}
-
-/// @brief Returns 10 raised to the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 cexp10(const T1& x)
-{
- return intrinsics::cexp10(x);
-}
-
-/// @brief Returns template expression that returns 10 raised to the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cexp10, E1> cexp10(E1&& x)
-{
- return { fn::cexp10(), std::forward<E1>(x) };
-}
-
-/// @brief Converts complex number to polar
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 polar(const T1& x)
-{
- return intrinsics::polar(x);
-}
-
-/// @brief Returns template expression that converts complex number to polar
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::polar, E1> polar(E1&& x)
-{
- return { fn::polar(), std::forward<E1>(x) };
-}
-
-/// @brief Converts complex number to cartesian
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 cartesian(const T1& x)
-{
- return intrinsics::cartesian(x);
-}
-
-/// @brief Returns template expression that converts complex number to cartesian
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cartesian, E1> cartesian(E1&& x)
-{
- return { fn::cartesian(), std::forward<E1>(x) };
-}
-
-/// @brief Returns square root of the complex number x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 csqrt(const T1& x)
-{
- return intrinsics::csqrt(x);
-}
-
-/// @brief Returns template expression that returns square root of the complex number x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::csqrt, E1> csqrt(E1&& x)
-{
- return { fn::csqrt(), std::forward<E1>(x) };
-}
-} // namespace kfr
-
-namespace std
-{
-template <typename T1, typename T2>
-struct common_type<kfr::complex<T1>, kfr::complex<T2>>
-{
- using type = kfr::complex<typename common_type<T1, T2>::type>;
-};
-template <typename T1, typename T2>
-struct common_type<kfr::complex<T1>, T2>
-{
- using type = kfr::complex<typename common_type<T1, T2>::type>;
-};
-template <typename T1, typename T2>
-struct common_type<T1, kfr::complex<T2>>
-{
- using type = kfr::complex<typename common_type<T1, T2>::type>;
-};
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>>
-{
- using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
-};
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>>
-{
- using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
-};
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::complex<T1>, kfr::vec<T2, N>>
-{
- using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
-};
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::vec<T1, N>, kfr::complex<T2>>
-{
- using type = kfr::vec<kfr::complex<typename common_type<T1, T2>::type>, N>;
-};
-} // namespace std
-
-CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/constants.hpp b/include/kfr/base/constants.hpp
@@ -1,299 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "types.hpp"
-#include <limits>
-
-CMT_PRAGMA_MSVC(warning(push))
-CMT_PRAGMA_MSVC(warning(disable : 4309))
-CMT_PRAGMA_MSVC(warning(disable : 4146))
-
-namespace kfr
-{
-
-#if CMT_COMPILER_GNU
-constexpr double infinity = __builtin_inf();
-constexpr double qnan = __builtin_nan("");
-#else
-constexpr double infinity = HUGE_VAL;
-constexpr double qnan = NAN;
-#endif
-
-template <typename T>
-struct constants
-{
-public:
- using Tsub = subtype<T>;
-
- constexpr static Tsub pi_s(int m, int d = 1) { return pi * m / d; }
- constexpr static Tsub recip_pi_s(int m, int d = 1) { return recip_pi * m / d; }
-
- constexpr static Tsub pi = static_cast<Tsub>(3.1415926535897932384626433832795);
- constexpr static Tsub sqr_pi = static_cast<Tsub>(9.8696044010893586188344909998762);
- constexpr static Tsub recip_pi = static_cast<Tsub>(0.31830988618379067153776752674503);
- constexpr static Tsub degtorad = static_cast<Tsub>(pi / 180);
- constexpr static Tsub radtodeg = static_cast<Tsub>(pi * 180);
- constexpr static Tsub e = static_cast<Tsub>(2.718281828459045235360287471352662);
- constexpr static Tsub recip_log_2 = static_cast<Tsub>(1.442695040888963407359924681001892137426645954);
- constexpr static Tsub recip_log_10 = static_cast<Tsub>(0.43429448190325182765112891891661);
- constexpr static Tsub log_2 = static_cast<Tsub>(0.69314718055994530941723212145818);
- constexpr static Tsub log_10 = static_cast<Tsub>(2.3025850929940456840179914546844);
- constexpr static Tsub sqrt_2 = static_cast<Tsub>(1.4142135623730950488016887242097);
-
- constexpr static Tsub fold_constant_div = choose_const<Tsub>(
- CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01));
-
- constexpr static Tsub fold_constant_hi = choose_const<Tsub>(
- CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01));
- constexpr static Tsub fold_constant_rem1 =
- choose_const<Tsub>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f),
- CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08));
- constexpr static Tsub fold_constant_rem2 =
- choose_const<Tsub>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f),
- CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15));
-
- constexpr static Tsub epsilon = std::numeric_limits<Tsub>::epsilon();
- constexpr static Tsub infinity = std::numeric_limits<Tsub>::infinity();
- constexpr static Tsub neginfinity = -std::numeric_limits<Tsub>::infinity();
- constexpr static Tsub qnan = std::numeric_limits<Tsub>::quiet_NaN();
-
-#if CMT_COMPILER_GNU
-
- CMT_PRAGMA_GNU(GCC diagnostic push)
- CMT_PRAGMA_GNU(GCC diagnostic ignored "-Woverflow")
-
- constexpr static Tsub allones()
- {
- if (is_same<Tsub, f32>::value)
- {
- return -__builtin_nanf("0xFFFFFFFF");
- }
- else if (is_same<Tsub, f64>::value)
- {
- return -__builtin_nan("0xFFFFFFFFFFFFFFFF");
- }
- else
- {
- return static_cast<Tsub>(-1ll);
- }
- }
-
- constexpr static Tsub allzeros() { return Tsub(0); }
-
- constexpr static Tsub highbitmask()
- {
- if (is_same<Tsub, f32>::value)
- {
- return -0.0f;
- }
- else if (is_same<Tsub, f64>::value)
- {
- return -0.0;
- }
- else
- {
- return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1));
- }
- }
-
- constexpr static Tsub invhighbitmask()
- {
- if (is_same<Tsub, f32>::value)
- {
- return __builtin_nanf("0xFFFFFFFF");
- }
- else if (is_same<Tsub, f64>::value)
- {
- return __builtin_nan("0xFFFFFFFFFFFFFFFF");
- }
- else
- {
- return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1);
- }
- }
- CMT_PRAGMA_GNU(GCC diagnostic pop)
-#else
-
- static Tsub allones()
- {
- if (is_same<Tsub, f32>::value)
- {
- return static_cast<Tsub>(bitcast<f32>(0xFFFFFFFFu));
- }
- else if (is_same<Tsub, f64>::value)
- {
- return static_cast<Tsub>(bitcast<f64>(0xFFFFFFFFFFFFFFFFull));
- }
- else
- {
- return static_cast<Tsub>(-1ll);
- }
- }
-
- constexpr static Tsub allzeros() { return Tsub(0); }
-
- static Tsub highbitmask()
- {
- if (is_same<Tsub, f32>::value)
- {
- return static_cast<Tsub>(-0.0f);
- }
- else if (is_same<Tsub, f64>::value)
- {
- return static_cast<Tsub>(-0.0);
- }
- else
- {
- return static_cast<Tsub>(1ull << (sizeof(Tsub) * 8 - 1));
- }
- }
-
- static Tsub invhighbitmask()
- {
- if (is_same<Tsub, f32>::value)
- {
- return static_cast<Tsub>(bitcast<f32>(0x7FFFFFFFu));
- }
- else if (is_same<Tsub, f64>::value)
- {
- return static_cast<Tsub>(bitcast<f64>(0x7FFFFFFFFFFFFFFFull));
- }
- else
- {
- return static_cast<Tsub>((1ull << (sizeof(Tsub) * 8 - 1)) - 1);
- }
- }
-#endif
-};
-
-template <typename T>
-constexpr subtype<T> constants<T>::pi;
-template <typename T>
-constexpr subtype<T> constants<T>::sqr_pi;
-template <typename T>
-constexpr subtype<T> constants<T>::recip_pi;
-template <typename T>
-constexpr subtype<T> constants<T>::degtorad;
-template <typename T>
-constexpr subtype<T> constants<T>::radtodeg;
-template <typename T>
-constexpr subtype<T> constants<T>::e;
-template <typename T>
-constexpr subtype<T> constants<T>::recip_log_2;
-template <typename T>
-constexpr subtype<T> constants<T>::recip_log_10;
-template <typename T>
-constexpr subtype<T> constants<T>::log_2;
-template <typename T>
-constexpr subtype<T> constants<T>::log_10;
-template <typename T>
-constexpr subtype<T> constants<T>::sqrt_2;
-template <typename T>
-constexpr subtype<T> constants<T>::fold_constant_div;
-template <typename T>
-constexpr subtype<T> constants<T>::fold_constant_hi;
-template <typename T>
-constexpr subtype<T> constants<T>::fold_constant_rem1;
-template <typename T>
-constexpr subtype<T> constants<T>::fold_constant_rem2;
-template <typename T>
-constexpr subtype<T> constants<T>::epsilon;
-template <typename T>
-constexpr subtype<T> constants<T>::infinity;
-template <typename T>
-constexpr subtype<T> constants<T>::neginfinity;
-template <typename T>
-constexpr subtype<T> constants<T>::qnan;
-
-/// π (pi)
-/// c_pi<f64, 4> = 4pi
-/// c_pi<f64, 3, 4> = 3/4pi
-template <typename T, int m = 1, int d = 1>
-constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d);
-
-/// π² (pi²)
-/// c_sqr_pi<f64, 4> = 4pi²
-/// c_sqr_pi<f64, 3, 4> = 3/4pi²
-template <typename T, int m = 1, int d = 1>
-constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d);
-
-/// 1/Ï€ (1/pi)
-/// c_recip_pi<f64> 1/pi
-/// c_recip_pi<f64, 4> 4/pi
-template <typename T, int m = 1, int d = 1>
-constexpr subtype<T> c_recip_pi = subtype<T>(0.31830988618379067153776752674503 * m / d);
-
-/// degree to radian conversion factor
-template <typename T>
-constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>;
-
-/// radian to degree conversion factor
-template <typename T>
-constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>;
-
-/// e, Euler's number
-template <typename T, int m = 1, int d = 1>
-constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d);
-
-template <typename T>
-constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 32 ? 23 : 52;
-
-template <typename T>
-constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1;
-
-template <typename T>
-constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon());
-
-/// infinity
-template <typename T>
-constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity();
-
-/// -infinity
-template <typename T>
-constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity();
-
-/// Quiet NaN
-template <typename T>
-constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN();
-
-template <typename T>
-constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954);
-
-template <typename T>
-constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661);
-
-template <typename T>
-constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818);
-
-template <typename T>
-constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844);
-
-template <typename T, int m = 1, int d = 1>
-constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d);
-} // namespace kfr
-
-CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/conversion.hpp b/include/kfr/base/conversion.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup math
+/** @addtogroup conversion
* @{
*/
/*
@@ -25,12 +25,15 @@
*/
#pragma once
-#include "types.hpp"
+#include "../math/clamp.hpp"
+#include "../simd/types.hpp"
+#include "../simd/vec.hpp"
#include "univector.hpp"
-#include "vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
enum class audio_sample_type
{
@@ -179,7 +182,7 @@ template <typename Tout, typename Tin, typename Tout_traits = audio_sample_trait
inline Tout convert_sample(const Tin& in)
{
constexpr auto scale = Tout_traits::scale / Tin_traits::scale;
- return cast<Tout>(clamp(in * scale, -Tout_traits::scale, +Tout_traits::scale));
+ return innercast<Tout>(clamp(in * scale, -Tout_traits::scale, +Tout_traits::scale));
}
/// @brief Deinterleaves and converts audio samples
@@ -275,4 +278,5 @@ void convert(void* out, audio_sample_type out_type, const Tin* in, size_t size)
convert(reinterpret_cast<type*>(out), in, size);
});
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp
@@ -1,107 +0,0 @@
-/** @addtogroup shuffle
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "shuffle.hpp"
-#include "types.hpp"
-
-namespace kfr
-{
-
-namespace internal
-{
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative")
-
-constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; }
-
-template <u32 m, u32 shift, u32... values>
-constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>)
-{
- return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>());
-}
-
-template <size_t bits>
-constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>)
-{
- return bit_permute_step_impl(
- x,
- cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >>
- (32 - bits);
-}
-
-template <size_t bits>
-constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>)
-{
- return bit_permute_step_impl(
- x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >>
- (32 - bits);
-}
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
-
-template <size_t radix, size_t bits>
-struct shuffle_index_digitreverse
-{
- constexpr inline size_t operator()(size_t index) const noexcept
- {
- return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>());
- }
-};
-} // namespace internal
-
-template <size_t radix, size_t group = 1, typename T, size_t N>
-CMT_INLINE vec<T, N> digitreverse(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(
- csizeseq_t<N / group>().map(internal::shuffle_index_digitreverse<radix, ilog2(N / group)>())));
-}
-
-template <size_t groupsize = 1, typename T, size_t N>
-CMT_INLINE vec<T, N> bitreverse(const vec<T, N>& x)
-{
- return digitreverse<2, groupsize>(x);
-}
-
-template <size_t groupsize = 1, typename T, size_t N>
-CMT_INLINE vec<T, N> digitreverse4(const vec<T, N>& x)
-{
- return digitreverse<4, groupsize>(x);
-}
-
-template <size_t bits>
-constexpr inline u32 bitreverse(u32 x)
-{
- return internal::digitreverse_impl<bits>(x, csize_t<2>());
-}
-
-template <size_t bits>
-constexpr inline u32 digitreverse4(u32 x)
-{
- return internal::digitreverse_impl<bits>(x, csize_t<4>());
-}
-} // namespace kfr
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -25,9 +25,10 @@
*/
#pragma once
-#include "platform.hpp"
-#include "types.hpp"
-#include "vec.hpp"
+#include "../simd/platform.hpp"
+#include "../simd/shuffle.hpp"
+#include "../simd/types.hpp"
+#include "../simd/vec.hpp"
#include <tuple>
#ifdef KFR_STD_COMPLEX
@@ -36,9 +37,12 @@
CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses")
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
constexpr size_t inout_context_size = 16;
@@ -73,20 +77,20 @@ struct complex;
constexpr size_t infinite_size = static_cast<size_t>(-1);
-CMT_INLINE constexpr size_t size_add(size_t x, size_t y)
+CMT_INTRINSIC constexpr size_t size_add(size_t x, size_t y)
{
return (x == infinite_size || y == infinite_size) ? infinite_size : x + y;
}
-CMT_INLINE constexpr size_t size_sub(size_t x, size_t y)
+CMT_INTRINSIC constexpr size_t size_sub(size_t x, size_t y)
{
return (x == infinite_size || y == infinite_size) ? infinite_size : (x > y ? x - y : 0);
}
-CMT_INLINE constexpr size_t size_min(size_t x) noexcept { return x; }
+CMT_INTRINSIC constexpr size_t size_min(size_t x) CMT_NOEXCEPT { return x; }
template <typename... Ts>
-CMT_INLINE constexpr size_t size_min(size_t x, size_t y, Ts... rest) noexcept
+CMT_INTRINSIC constexpr size_t size_min(size_t x, size_t y, Ts... rest) CMT_NOEXCEPT
{
return size_min(x < y ? x : y, rest...);
}
@@ -94,23 +98,23 @@ CMT_INLINE constexpr size_t size_min(size_t x, size_t y, Ts... rest) noexcept
/// @brief Base class of all input expressoins
struct input_expression
{
- CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; }
+ KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; }
constexpr static bool is_incremental = false;
- CMT_INLINE constexpr void begin_block(cinput_t, size_t) const {}
- CMT_INLINE constexpr void end_block(cinput_t, size_t) const {}
+ KFR_MEM_INTRINSIC constexpr void begin_block(cinput_t, size_t) const {}
+ KFR_MEM_INTRINSIC constexpr void end_block(cinput_t, size_t) const {}
};
/// @brief Base class of all output expressoins
struct output_expression
{
- CMT_INLINE constexpr static size_t size() noexcept { return infinite_size; }
+ KFR_MEM_INTRINSIC constexpr static size_t size() CMT_NOEXCEPT { return infinite_size; }
constexpr static bool is_incremental = false;
- CMT_INLINE constexpr void begin_block(coutput_t, size_t) const {}
- CMT_INLINE constexpr void end_block(coutput_t, size_t) const {}
+ KFR_MEM_INTRINSIC constexpr void begin_block(coutput_t, size_t) const {}
+ KFR_MEM_INTRINSIC constexpr void end_block(coutput_t, size_t) const {}
};
/// @brief Check if the type argument is an input expression
@@ -141,17 +145,14 @@ using is_numeric_args = and_t<is_numeric<Ts>...>;
namespace internal
{
template <typename T, size_t N, typename Fn>
-static vec<T, N> get_fn_value(size_t index, Fn&& fn)
+inline vec<T, N> get_fn_value(size_t index, Fn&& fn)
{
- vec<T, N> x;
- for (size_t i = 0; i < N; i++)
- x[i] = fn(index + i);
- return x;
+ return apply(fn, enumerate<size_t, N>() + index);
}
} // namespace internal
template <typename E, typename Fn>
-static void test_expression(const E& expr, size_t size, Fn&& fn, const char* expression = nullptr)
+void test_expression(const E& expr, size_t size, Fn&& fn, const char* expression = nullptr)
{
using T = value_type_of<E>;
::testo::test_case* test = ::testo::active_test();
@@ -159,38 +160,20 @@ static void test_expression(const E& expr, size_t size, Fn&& fn, const char* exp
test->check(c <= expr.size() == size, expression);
if (expr.size() != size)
return;
- size = size_min(size, 100);
+ size = size_min(size, 200);
+ constexpr size_t maxsize = 2 + ilog2(vector_width<T> * 2);
for (size_t i = 0; i < size;)
{
const size_t next_size =
- std::min(prev_poweroftwo(size - i), static_cast<size_t>(1) << (std::rand() % 6));
- switch (next_size)
- {
- case 1:
- test->check(c <= expr(cinput, i, vec_t<T, 1>()) == internal::get_fn_value<T, 1>(i, fn),
- expression);
- break;
- case 2:
- test->check(c <= expr(cinput, i, vec_t<T, 2>()) == internal::get_fn_value<T, 2>(i, fn),
- expression);
- break;
- case 4:
- test->check(c <= expr(cinput, i, vec_t<T, 4>()) == internal::get_fn_value<T, 4>(i, fn),
- expression);
- break;
- case 8:
- test->check(c <= expr(cinput, i, vec_t<T, 8>()) == internal::get_fn_value<T, 8>(i, fn),
- expression);
- break;
- case 16:
- test->check(c <= expr(cinput, i, vec_t<T, 16>()) == internal::get_fn_value<T, 16>(i, fn),
- expression);
- break;
- case 32:
- test->check(c <= expr(cinput, i, vec_t<T, 32>()) == internal::get_fn_value<T, 32>(i, fn),
+ std::min(prev_poweroftwo(size - i), static_cast<size_t>(1) << (std::rand() % maxsize));
+
+ cswitch(csize<1> << csizeseq<maxsize>, next_size, [&](auto x) {
+ constexpr size_t nsize = val_of(decltype(x)());
+ ::testo::scope s(as_string("i = ", i));
+ test->check(c <= get_elements(expr, cinput, i, vec_shape<T, nsize>()) ==
+ internal::get_fn_value<T, nsize>(i, fn),
expression);
- break;
- }
+ });
i += next_size;
}
}
@@ -208,33 +191,26 @@ template <typename T, typename Fn>
struct expression_lambda : input_expression
{
using value_type = T;
- CMT_INLINE expression_lambda(Fn&& fn) : fn(std::move(fn)) {}
+ KFR_MEM_INTRINSIC expression_lambda(Fn&& fn) : fn(std::move(fn)) {}
- template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_t<T, N>>::value)>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, cinput_t, size_t, vec_shape<T, N>>::value)>
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return fn(cinput, index, y);
+ return self.fn(cinput, index, y);
}
template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn, size_t>::value)>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t, size_t index,
+ vec_shape<T, N>)
{
- vec<T, N> result;
- for (size_t i = 0; i < N; i++)
- {
- result[i] = fn(index + i);
- }
- return result;
+ return apply(self.fn, enumerate<size_t, N>() + index);
}
template <size_t N, KFR_ENABLE_IF(N&& is_callable<Fn>::value)>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lambda& self, cinput_t, size_t,
+ vec_shape<T, N>)
{
- vec<T, N> result;
- for (size_t i = 0; i < N; i++)
- {
- result[i] = fn();
- }
- return result;
+ return apply<N>(self.fn);
}
Fn fn;
@@ -269,19 +245,22 @@ namespace internal
{
template <typename... Args>
-struct expression_base : input_expression
+struct expression_with_arguments : input_expression
{
- CMT_INLINE constexpr size_t size() const noexcept { return size_impl(indicesfor_t<Args...>()); }
+ KFR_MEM_INTRINSIC constexpr size_t size() const CMT_NOEXCEPT
+ {
+ return size_impl(indicesfor_t<Args...>());
+ }
constexpr static size_t count = sizeof...(Args);
- expression_base() = delete;
- constexpr expression_base(Args&&... args) noexcept : args(std::forward<Args>(args)...) {}
+ expression_with_arguments() = delete;
+ constexpr expression_with_arguments(Args&&... args) CMT_NOEXCEPT : args(std::forward<Args>(args)...) {}
- CMT_INLINE void begin_block(cinput_t cinput, size_t size) const
+ KFR_MEM_INTRINSIC void begin_block(cinput_t cinput, size_t size) const
{
begin_block_impl(cinput, size, indicesfor_t<Args...>());
}
- CMT_INLINE void end_block(cinput_t cinput, size_t size) const
+ KFR_MEM_INTRINSIC void end_block(cinput_t cinput, size_t size) const
{
end_block_impl(cinput, size, indicesfor_t<Args...>());
}
@@ -290,44 +269,48 @@ struct expression_base : input_expression
protected:
template <size_t... indices>
- CMT_INLINE constexpr size_t size_impl(csizes_t<indices...>) const noexcept
+ KFR_MEM_INTRINSIC constexpr size_t size_impl(csizes_t<indices...>) const CMT_NOEXCEPT
{
return size_min(std::get<indices>(this->args).size()...);
}
template <typename Fn, typename T, size_t N>
- CMT_INLINE vec<T, N> call(cinput_t cinput, Fn&& fn, size_t index, vec_t<T, N> x) const
+ KFR_MEM_INTRINSIC vec<T, N> call(cinput_t cinput, Fn&& fn, size_t index, vec_shape<T, N> x) const
{
return call_impl(cinput, std::forward<Fn>(fn), indicesfor_t<Args...>(), index, x);
}
template <size_t ArgIndex, typename U, size_t N,
typename T = value_type_of<typename details::get_nth_type<ArgIndex, Args...>::type>>
- CMT_INLINE vec<U, N> argument(cinput_t cinput, csize_t<ArgIndex>, size_t index, vec_t<U, N>) const
+ KFR_MEM_INTRINSIC vec<U, N> argument(cinput_t cinput, csize_t<ArgIndex>, size_t index,
+ vec_shape<U, N>) const
{
static_assert(ArgIndex < count, "Incorrect ArgIndex");
- return static_cast<vec<U, N>>(std::get<ArgIndex>(this->args)(cinput, index, vec_t<T, N>()));
+ return get_elements(
+ static_cast<vec<U, N>>(std::get<ArgIndex>(this->args), cinput, index, vec_shape<T, N>()));
}
template <typename U, size_t N,
typename T = value_type_of<typename details::get_nth_type<0, Args...>::type>>
- CMT_INLINE vec<U, N> argument_first(cinput_t cinput, size_t index, vec_t<U, N>) const
+ KFR_MEM_INTRINSIC vec<U, N> argument_first(cinput_t cinput, size_t index, vec_shape<U, N>) const
{
- return static_cast<vec<U, N>>(std::get<0>(this->args)(cinput, index, vec_t<T, N>()));
+ return static_cast<vec<U, N>>(
+ get_elements(std::get<0>(this->args), cinput, index, vec_shape<T, N>()));
}
private:
template <typename Fn, typename T, size_t N, size_t... indices>
- CMT_INLINE vec<T, N> call_impl(cinput_t cinput, Fn&& fn, csizes_t<indices...>, size_t index,
- vec_t<T, N>) const
+ KFR_MEM_INTRINSIC vec<T, N> call_impl(cinput_t cinput, Fn&& fn, csizes_t<indices...>, size_t index,
+ vec_shape<T, N>) const
{
- return fn(std::get<indices>(this->args)(cinput, index, vec_t<value_type_of<Args>, N>())...);
+ return fn(get_elements(std::get<indices>(this->args), cinput, index,
+ vec_shape<value_type_of<Args>, N>())...);
}
template <size_t... indices>
- CMT_INLINE void begin_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const
+ KFR_MEM_INTRINSIC void begin_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const
{
swallow{ (std::get<indices>(args).begin_block(cinput, size), 0)... };
}
template <size_t... indices>
- CMT_INLINE void end_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const
+ KFR_MEM_INTRINSIC void end_block_impl(cinput_t cinput, size_t size, csizes_t<indices...>) const
{
swallow{ (std::get<indices>(args).end_block(cinput, size), 0)... };
}
@@ -338,14 +321,15 @@ struct expression_scalar : input_expression
{
using value_type = T;
expression_scalar() = delete;
- constexpr expression_scalar(const T& val) noexcept : val(val) {}
- constexpr expression_scalar(const vec<T, width>& val) noexcept : val(val) {}
+ constexpr expression_scalar(const T& val) CMT_NOEXCEPT : val(val) {}
+ constexpr expression_scalar(const vec<T, width>& val) CMT_NOEXCEPT : val(val) {}
vec<T, width> val;
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_scalar& self, cinput_t, size_t,
+ vec_shape<T, N>)
{
- return resize<N>(val);
+ return resize<N>(self.val);
}
};
@@ -377,27 +361,30 @@ template <typename T>
using arg = typename internal::arg_impl<decay<T>, T>::type;
template <typename Fn, typename... Args>
-struct expression_function : expression_base<arg<Args>...>
+struct expression_function : expression_with_arguments<arg<Args>...>
{
using value_type =
subtype<decltype(std::declval<Fn>()(std::declval<vec<value_type_of<arg<Args>>, 1>>()...))>;
using T = value_type;
- expression_function(Fn&& fn, arg<Args>&&... args) noexcept
- : expression_base<arg<Args>...>(std::forward<arg<Args>>(args)...), fn(std::forward<Fn>(fn))
+ expression_function(Fn&& fn, arg<Args>&&... args) CMT_NOEXCEPT
+ : expression_with_arguments<arg<Args>...>(std::forward<arg<Args>>(args)...),
+ fn(std::forward<Fn>(fn))
{
}
- expression_function(const Fn& fn, arg<Args>&&... args) noexcept
- : expression_base<arg<Args>...>(std::forward<arg<Args>>(args)...), fn(fn)
+ expression_function(const Fn& fn, arg<Args>&&... args) CMT_NOEXCEPT
+ : expression_with_arguments<arg<Args>...>(std::forward<arg<Args>>(args)...),
+ fn(fn)
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> x) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_function& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> x)
{
- return this->call(cinput, fn, index, x);
+ return self.call(cinput, self.fn, index, x);
}
- const Fn& get_fn() const noexcept { return fn; }
+ const Fn& get_fn() const CMT_NOEXCEPT { return fn; }
protected:
Fn fn;
@@ -405,25 +392,25 @@ protected:
} // namespace internal
template <typename A>
-CMT_INLINE internal::arg<A> e(A&& a)
+CMT_INTRINSIC internal::arg<A> e(A&& a)
{
return internal::arg<A>(std::forward<A>(a));
}
template <typename T>
-CMT_INLINE internal::expression_scalar<T> scalar(const T& val)
+CMT_INTRINSIC internal::expression_scalar<T> scalar(const T& val)
{
return internal::expression_scalar<T>(val);
}
template <typename T, size_t N>
-CMT_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val)
+CMT_INTRINSIC internal::expression_scalar<T, N> scalar(const vec<T, N>& val)
{
return internal::expression_scalar<T, N>(val);
}
template <typename Fn, typename... Args>
-CMT_INLINE internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&& fn, Args&&... args)
+CMT_INTRINSIC internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&& fn, Args&&... args)
{
return internal::expression_function<decay<Fn>, Args...>(std::forward<Fn>(fn),
std::forward<Args>(args)...);
@@ -434,17 +421,16 @@ CMT_INLINE internal::expression_function<decay<Fn>, Args...> bind_expression(Fn&
* @param args new arguments for the function
*/
template <typename Fn, typename... OldArgs, typename... NewArgs>
-CMT_INLINE internal::expression_function<Fn, NewArgs...> rebind(
+CMT_INTRINSIC internal::expression_function<Fn, NewArgs...> rebind(
const internal::expression_function<Fn, OldArgs...>& e, NewArgs&&... args)
{
return internal::expression_function<Fn, NewArgs...>(e.get_fn(), std::forward<NewArgs>(args)...);
}
-template <cpu_t c = cpu_t::native, size_t width = 0, typename OutputExpr, typename InputExpr,
- size_t groupsize = 1>
-CMT_INLINE static size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 0,
- size_t size = infinite_size, coutput_t coutput = nullptr,
- cinput_t cinput = nullptr, csize_t<groupsize> = csize_t<groupsize>())
+template <size_t width = 0, typename OutputExpr, typename InputExpr, size_t groupsize = 1>
+CMT_INTRINSIC static size_t process(OutputExpr&& out, const InputExpr& in, size_t start = 0,
+ size_t size = infinite_size, coutput_t coutput = nullptr,
+ cinput_t cinput = nullptr, csize_t<groupsize> = csize_t<groupsize>())
{
using Tin = value_type_of<InputExpr>;
static_assert(is_output_expression<OutputExpr>::value, "OutFn must be an expression");
@@ -453,24 +439,25 @@ CMT_INLINE static size_t process(OutputExpr&& out, const InputExpr& in, size_t s
size = size_sub(size_min(out.size(), in.size(), size_add(size, start)), start);
if (size == 0 || size == infinite_size)
return size;
- const size_t end = start + size;
out.begin_block(coutput, size);
in.begin_block(cinput, size);
#ifdef NDEBUG
- constexpr size_t w = width == 0 ? platform<Tin, c>::vector_capacity / 4 : width;
+ constexpr size_t w = width == 0 ? maximum_vector_size<Tin> : width;
#else
- constexpr size_t w = width == 0 ? platform<Tin, c>::vector_width : width;
+ constexpr size_t w = width == 0 ? vector_width<Tin> : width;
#endif
+ static_assert(w > 0 && is_poweroftwo(w), "");
+
size_t i = start;
CMT_LOOP_NOUNROLL
for (; i < start + size / w * w; i += w)
- out(coutput, i, in(cinput, i, vec_t<Tin, w>()));
+ out(coutput, i, get_elements(in, cinput, i, vec_shape<Tin, w>()));
CMT_LOOP_NOUNROLL
for (; i < start + size / groupsize * groupsize; i += groupsize)
- out(coutput, i, in(cinput, i, vec_t<Tin, groupsize>()));
+ out(coutput, i, get_elements(in, cinput, i, vec_shape<Tin, groupsize>()));
in.end_block(cinput, size);
out.end_block(coutput, size);
@@ -483,11 +470,12 @@ struct input_expression_base : input_expression
virtual ~input_expression_base() {}
virtual T input(size_t index) const = 0;
template <typename U, size_t N>
- CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+ friend KFR_INTRINSIC vec<U, N> get_elements(const input_expression_base& self, cinput_t, size_t index,
+ vec_shape<U, N>)
{
vec<U, N> out;
for (size_t i = 0; i < N; i++)
- out[i] = static_cast<U>(input(index + i));
+ out[i] = static_cast<U>(self.input(index + i));
return out;
}
};
@@ -499,12 +487,19 @@ struct output_expression_base : output_expression
virtual void output(size_t index, const T& value) = 0;
template <typename U, size_t N>
- CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
+ KFR_MEM_INTRINSIC void operator()(coutput_t, size_t index, const vec<U, N>& value)
{
for (size_t i = 0; i < N; i++)
output(index + i, static_cast<T>(value[i]));
}
};
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+CMT_INTRINSIC internal::expression_function<fn::interleave, E1, E2> interleave(E1&& x, E2&& y)
+{
+ return { fn::interleave(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/filter.hpp b/include/kfr/base/filter.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup math
+/** @addtogroup filter
* @{
*/
/*
@@ -32,6 +32,8 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
/// @brief Abstract base class for filters with one argument. Mainly for DSP
template <typename T>
@@ -131,16 +133,17 @@ protected:
/// @brief Converts expression with placeholder to filter. Placeholder and filter must have the same type
template <typename E, typename T = value_type_of<E>>
-KFR_SINTRIN expression_filter<T> to_filter(E&& e)
+KFR_INTRINSIC expression_filter<T> to_filter(E&& e)
{
return expression_filter<T>(to_pointer(std::move(e)));
}
/// @brief Converts expression with placeholder to filter. Placeholder and filter must have the same type
template <typename T, typename E>
-KFR_SINTRIN expression_filter<T> to_filter(expression_pointer<T>&& e)
+KFR_INTRINSIC expression_filter<T> to_filter(expression_pointer<T>&& e)
{
return expression_filter<T>(std::move(e));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/fraction.hpp b/include/kfr/base/fraction.hpp
@@ -25,8 +25,7 @@
*/
#pragma once
-#include "operators.hpp"
-#include "vec.hpp"
+#include "../simd/types.hpp"
namespace kfr
{
diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp
@@ -1,268 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "expression.hpp"
-#include "shuffle.hpp"
-#include "types.hpp"
-#include "vec.hpp"
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
-
-namespace kfr
-{
-
-#define KFR_I_CONVERTER(fn) \
- template <typename T1, typename... Args, typename Tout = ::cometa::common_type<T1, Args...>> \
- KFR_SINTRIN Tout fn(const T1& a, const Args&... b) \
- { \
- using vecout = vec1<Tout>; \
- return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \
- }
-
-#define KFR_I_FLT_CONVERTER(fn) \
- template <typename T1, typename... Args, \
- typename Tout = ::kfr::flt_type<::cometa::common_type<T1, Args...>>> \
- KFR_SINTRIN Tout fn(const T1& a, const Args&... b) \
- { \
- using vecout = vec1<Tout>; \
- return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...)); \
- }
-
-namespace intrinsics
-{
-#ifdef CMT_ARCH_X86
-using f32sse = vec<f32, 4>;
-using f64sse = vec<f64, 2>;
-using i8sse = vec<i8, 16>;
-using i16sse = vec<i16, 8>;
-using i32sse = vec<i32, 4>;
-using i64sse = vec<i64, 2>;
-using u8sse = vec<u8, 16>;
-using u16sse = vec<u16, 8>;
-using u32sse = vec<u32, 4>;
-using u64sse = vec<u64, 2>;
-
-using f32avx = vec<f32, 8>;
-using f64avx = vec<f64, 4>;
-using i8avx = vec<i8, 32>;
-using i16avx = vec<i16, 16>;
-using i32avx = vec<i32, 8>;
-using i64avx = vec<i64, 4>;
-using u8avx = vec<u8, 32>;
-using u16avx = vec<u16, 16>;
-using u32avx = vec<u32, 8>;
-using u64avx = vec<u64, 4>;
-
-using f32avx512 = vec<f32, 16>;
-using f64avx512 = vec<f64, 8>;
-using i8avx512 = vec<i8, 64>;
-using i16avx512 = vec<i16, 32>;
-using i32avx512 = vec<i32, 16>;
-using i64avx512 = vec<i64, 8>;
-using u8avx512 = vec<u8, 64>;
-using u16avx512 = vec<u16, 32>;
-using u32avx512 = vec<u32, 16>;
-using u64avx512 = vec<u64, 8>;
-
-#else
-using f32neon = vec<f32, 4>;
-using f64neon = vec<f64, 2>;
-using i8neon = vec<i8, 16>;
-using i16neon = vec<i16, 8>;
-using i32neon = vec<i32, 4>;
-using i64neon = vec<i64, 2>;
-using u8neon = vec<u8, 16>;
-using u16neon = vec<u16, 8>;
-using u32neon = vec<u32, 4>;
-using u64neon = vec<u64, 2>;
-#endif
-
-template <cpu_t c, typename T>
-constexpr inline size_t next_simd_width(size_t n)
-{
-#ifdef CMT_ARCH_X86
- return n > platform<T, cpu_t::sse2>::vector_width ? platform<T, c>::vector_width
- : platform<T, cpu_t::sse2>::vector_width;
-#endif
-#ifdef CMT_ARCH_ARM
- return platform<T, cpu_t::neon>::vector_width;
-#endif
-}
-
-template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
-KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x)
-{
- return extend<Nout>(x);
-}
-
-template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
-KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value)
-{
- return widen<Nout>(x, value);
-}
-
-#define KFR_HANDLE_ALL_SIZES_1(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return slice<0, N>(fn(expand_simd(a))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return concat(fn(low(a)), fn(high(a))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_FLT_1(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \
- KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \
- { \
- return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a)))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \
- KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \
- { \
- return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a)))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_F_1(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_f_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return slice<0, N>(fn(expand_simd(a))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_f_class<T>::value), \
- typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return concat(fn(low(a)), fn(high(a))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_I_1(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_i_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return slice<0, N>(fn(expand_simd(a))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_i_class<T>::value), \
- typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return concat(fn(low(a)), fn(high(a))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_U_1(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && is_u_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return slice<0, N>(fn(expand_simd(a))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && is_u_class<T>::value), \
- typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return concat(fn(low(a)), fn(high(a))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width && !is_f_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return slice<0, N>(fn(expand_simd(a))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width && !is_f_class<T>::value), \
- typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
- { \
- return concat(fn(low(a)), fn(high(a))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_2(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \
- { \
- return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \
- { \
- return concat(fn(low(a), low(b)), fn(high(a), high(b))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_2_INT(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, int b) \
- { \
- return slice<0, N>(fn(expand_simd(a), b)); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, int b) \
- { \
- return concat(fn(low(a), b), fn(high(a), b)); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_3(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \
- { \
- return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \
- { \
- return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \
- }
-
-#define KFR_HANDLE_ALL_SIZES_4(fn) \
- template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \
- { \
- return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \
- } \
- template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void> \
- KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \
- { \
- return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \
- }
-
-template <typename T>
-using vec1 = conditional<is_vec<T>::value, T, vec<T, 1>>;
-
-template <typename T>
-inline T to_scalar(const T& value)
-{
- return value;
-}
-template <typename T>
-inline T to_scalar(const vec<T, 1>& value)
-{
- return value[0];
-}
-} // namespace intrinsics
-} // namespace kfr
-CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/function_expressions.hpp b/include/kfr/base/function_expressions.hpp
@@ -0,0 +1,30 @@
+/** @addtogroup expressions
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+namespace kfr
+{
+} // namespace kfr
diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp
@@ -1,60 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/gamma.hpp"
-
-namespace kfr
-{
-
-/// @brief Returns the approximate gamma function of an argument
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> gamma(const T1& x)
-{
- return intrinsics::gamma(x);
-}
-
-/// @brief Creates expression that returns the approximate gamma function of an argument
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::gamma, E1> gamma(E1&& x)
-{
- return { fn::gamma(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the approximate factorial of an argument
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> factorial_approx(const T1& x)
-{
- return intrinsics::factorial_approx(x);
-}
-
-/// @brief Creates expression that returns the approximate factorial of an argument
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::factorial_approx, E1> factorial_approx(E1&& x)
-{
- return { fn::factorial_approx(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup expressions
+/** @addtogroup generators
* @{
*/
/*
@@ -25,14 +25,16 @@
*/
#pragma once
-#include "function.hpp"
-#include "log_exp.hpp"
-#include "select.hpp"
-#include "sin_cos.hpp"
-#include "vec.hpp"
+#include "../math/log_exp.hpp"
+#include "../math/select.hpp"
+#include "../math/sin_cos.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
namespace internal
{
@@ -41,14 +43,15 @@ template <typename T, size_t width_, typename Class>
struct generator : input_expression
{
constexpr static size_t width = width_;
- using value_type = T;
+ using value_type = T;
constexpr static bool is_incremental = true;
template <typename U, size_t N>
- CMT_INLINE vec<U, N> operator()(cinput_t, size_t, vec_t<U, N> t) const
+ friend KFR_INTRINSIC vec<U, N> get_elements(const generator& self, cinput_t, size_t,
+ vec_shape<U, N> t)
{
- return generate(t);
+ return self.generate(t);
}
void resync(T start) const { ptr_cast<Class>(this)->sync(start); }
@@ -70,7 +73,7 @@ protected:
}
template <size_t N, KFR_ENABLE_IF(N == width)>
- CMT_INLINE vec<T, N> generate(vec_t<T, N>) const
+ KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
{
const vec<T, N> result = value;
call_next();
@@ -78,7 +81,7 @@ protected:
}
template <size_t N, KFR_ENABLE_IF(N < width)>
- CMT_INLINE vec<T, N> generate(vec_t<T, N>) const
+ KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N>) const
{
const vec<T, N> result = narrow<N>(value);
shift(csize_t<N>());
@@ -86,7 +89,7 @@ protected:
}
template <size_t N, KFR_ENABLE_IF(N > width)>
- CMT_INLINE vec<T, N> generate(vec_t<T, N> x) const
+ KFR_MEM_INTRINSIC vec<T, N> generate(vec_shape<T, N> x) const
{
const auto lo = generate(low(x));
const auto hi = generate(high(x));
@@ -96,58 +99,64 @@ protected:
mutable vec<T, width> value;
};
-template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2)>
+template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
struct generator_linear : generator<T, width, generator_linear<T, width>>
{
- constexpr generator_linear(T start, T step) noexcept : step(step), vstep(step * width)
+ generator_linear(T start, T step) CMT_NOEXCEPT : step(step), vstep(step* width) { this->resync(start); }
+
+ KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
{
- this->resync(start);
+ this->value = start + enumerate<T, width>() * step;
}
- CMT_INLINE void sync(T start) const noexcept { this->value = start + enumerate<T, width>() * step; }
-
- CMT_INLINE void next() const noexcept { this->value += vstep; }
+ KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += vstep; }
protected:
T step;
T vstep;
};
-template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP>
+template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP>
struct generator_exp : generator<T, width, generator_exp<T, width>>
{
- generator_exp(T start, T step) noexcept : step(step), vstep(exp(make_vector(step * width))[0] - 1)
+ generator_exp(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp(make_vector(step* width))[0] - 1)
{
this->resync(start);
}
- CMT_INLINE void sync(T start) const noexcept { this->value = exp(start + enumerate<T, width>() * step); }
+ KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
+ {
+ this->value = exp(start + enumerate<T, width>() * step);
+ }
- CMT_INLINE void next() const noexcept { this->value += this->value * vstep; }
+ KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
protected:
T step;
T vstep;
};
-template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP>
+template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP>
struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
{
- generator_exp2(T start, T step) noexcept : step(step), vstep(exp2(make_vector(step * width))[0] - 1)
+ generator_exp2(T start, T step) CMT_NOEXCEPT : step(step), vstep(exp2(make_vector(step* width))[0] - 1)
{
this->resync(start);
}
- CMT_INLINE void sync(T start) const noexcept { this->value = exp2(start + enumerate<T, width>() * step); }
+ KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
+ {
+ this->value = exp2(start + enumerate<T, width>() * step);
+ }
- CMT_INLINE void next() const noexcept { this->value += this->value * vstep; }
+ KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT { this->value += this->value * vstep; }
protected:
T step;
T vstep;
};
-template <typename T, size_t width = platform<T>::vector_width* bitness_const(1, 2), KFR_ARCH_DEP>
+template <typename T, size_t width = vector_width<T>* bitness_const(1, 2), KFR_ARCH_DEP>
struct generator_cossin : generator<T, width, generator_cossin<T, width>>
{
generator_cossin(T start, T step)
@@ -155,9 +164,9 @@ struct generator_cossin : generator<T, width, generator_cossin<T, width>>
{
this->resync(start);
}
- CMT_INLINE void sync(T start) const noexcept { this->value = init_cossin(step, start); }
+ KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { this->value = init_cossin(step, start); }
- CMT_INLINE void next() const noexcept
+ KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT
{
this->value = this->value - subadd(alpha * this->value, beta * swap<2>(this->value));
}
@@ -172,7 +181,7 @@ protected:
}
};
-template <typename T, size_t width = platform<T>::vector_width* bitness_const(2, 4), KFR_ARCH_DEP>
+template <typename T, size_t width = vector_width<T>* bitness_const(2, 4), KFR_ARCH_DEP>
struct generator_sin : generator<T, width, generator_sin<T, width>>
{
generator_sin(T start, T step)
@@ -180,14 +189,14 @@ struct generator_sin : generator<T, width, generator_sin<T, width>>
{
this->resync(start);
}
- CMT_INLINE void sync(T start) const noexcept
+ KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT
{
const vec<T, width* 2> cs = splitpairs(cossin(dup(start + enumerate<T, width>() * step)));
this->cos_value = low(cs);
this->value = high(cs);
}
- CMT_INLINE void next() const noexcept
+ KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT
{
const vec<T, width> c = this->cos_value;
const vec<T, width> s = this->value;
@@ -200,7 +209,7 @@ struct generator_sin : generator<T, width, generator_sin<T, width>>
}
template <size_t N>
- void shift(csize_t<N>) const noexcept
+ void shift(csize_t<N>) const CMT_NOEXCEPT
{
const vec<T, width> oldvalue = this->value;
const vec<T, width> oldcosvalue = this->cos_value;
@@ -226,7 +235,7 @@ protected:
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_SINTRIN internal::generator_linear<TF> gen_linear(T1 start, T2 step)
+KFR_FUNCTION internal::generator_linear<TF> gen_linear(T1 start, T2 step)
{
return internal::generator_linear<TF>(start, step);
}
@@ -238,7 +247,7 @@ KFR_SINTRIN internal::generator_linear<TF> gen_linear(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_SINTRIN internal::generator_exp<TF> gen_exp(T1 start, T2 step)
+KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
{
return internal::generator_exp<TF>(start, step);
}
@@ -250,7 +259,7 @@ KFR_SINTRIN internal::generator_exp<TF> gen_exp(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_SINTRIN internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
+KFR_FUNCTION internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
{
return internal::generator_exp2<TF>(start, step);
}
@@ -266,7 +275,7 @@ KFR_SINTRIN internal::generator_exp2<TF> gen_exp2(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_SINTRIN internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
+KFR_FUNCTION internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
{
return internal::generator_cossin<TF>(start, step);
}
@@ -278,8 +287,9 @@ KFR_SINTRIN internal::generator_cossin<TF> gen_cossin(T1 start, T2 step)
\f]
*/
template <typename T1, typename T2, typename TF = ftype<common_type<T1, T2>>>
-KFR_SINTRIN internal::generator_sin<TF> gen_sin(T1 start, T2 step)
+KFR_FUNCTION internal::generator_sin<TF> gen_sin(T1 start, T2 step)
{
return internal::generator_sin<TF>(start, step);
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/horizontal.hpp b/include/kfr/base/horizontal.hpp
@@ -1,119 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "operators.hpp"
-
-namespace kfr
-{
-
-namespace internal
-{
-
-template <typename T, typename ReduceFn>
-CMT_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&)
-{
- return T(value[0]);
-}
-
-template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))>
-CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce)
-{
- return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce));
-}
-template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))>
-CMT_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce)
-{
- const T initial = reduce(initialvalue<T>());
- return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce));
-}
-} // namespace internal
-
-template <typename T, size_t N, typename ReduceFn>
-CMT_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce)
-{
- return internal::horizontal_impl(value, std::forward<ReduceFn>(reduce));
-}
-
-/// @brief Sum all elements of the vector
-template <typename T, size_t N>
-CMT_INLINE T hadd(const vec<T, N>& value)
-{
- return horizontal(value, fn::add());
-}
-KFR_FN(hadd)
-
-/// @brief Multiply all elements of the vector
-template <typename T, size_t N>
-CMT_INLINE T hmul(const vec<T, N>& value)
-{
- return horizontal(value, fn::mul());
-}
-KFR_FN(hmul)
-
-template <typename T, size_t N>
-CMT_INLINE T hbitwiseand(const vec<T, N>& value)
-{
- return horizontal(value, fn::bitwiseand());
-}
-KFR_FN(hbitwiseand)
-template <typename T, size_t N>
-CMT_INLINE T hbitwiseor(const vec<T, N>& value)
-{
- return horizontal(value, fn::bitwiseor());
-}
-KFR_FN(hbitwiseor)
-template <typename T, size_t N>
-CMT_INLINE T hbitwisexor(const vec<T, N>& value)
-{
- return horizontal(value, fn::bitwisexor());
-}
-KFR_FN(hbitwisexor)
-
-/// @brief Calculate the Dot-Product of two vectors
-template <typename T, size_t N>
-CMT_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y)
-{
- return hadd(x * y);
-}
-KFR_FN(dot)
-
-/// @brief Calculate the Arithmetic mean of all elements in the vector
-template <typename T, size_t N>
-CMT_INLINE T avg(const vec<T, N>& value)
-{
- return hadd(value) / N;
-}
-KFR_FN(avg)
-
-/// @brief Calculate the RMS of all elements in the vector
-template <typename T, size_t N>
-CMT_INLINE T rms(const vec<T, N>& value)
-{
- return internal::builtin_sqrt(hadd(value * value) / N);
-}
-KFR_FN(rms)
-} // namespace kfr
diff --git a/include/kfr/base/hyperbolic.hpp b/include/kfr/base/hyperbolic.hpp
@@ -1,120 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/hyperbolic.hpp"
-
-namespace kfr
-{
-
-/// @brief Returns the hyperbolic sine of the x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sinh(const T1& x)
-{
- return intrinsics::sinh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic sine of the x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sinh, E1> sinh(E1&& x)
-{
- return { fn::sinh(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic cosine of the x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> cosh(const T1& x)
-{
- return intrinsics::cosh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic cosine of the x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cosh, E1> cosh(E1&& x)
-{
- return { fn::cosh(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic tangent of the x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> tanh(const T1& x)
-{
- return intrinsics::tanh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic tangent of the x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::tanh, E1> tanh(E1&& x)
-{
- return { fn::tanh(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic cotangent of the x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> coth(const T1& x)
-{
- return intrinsics::coth(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic cotangent of the x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::coth, E1> coth(E1&& x)
-{
- return { fn::coth(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic sine of the even elements of the x and the hyperbolic cosine of the odd
-/// elements of the x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sinhcosh(const T1& x)
-{
- return intrinsics::sinhcosh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic sine of the even elements of the x and the
-/// hyperbolic cosine of the odd elements of the x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sinhcosh, E1> sinhcosh(E1&& x)
-{
- return { fn::sinhcosh(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the hyperbolic cosine of the even elements of the x and the hyperbolic sine of the odd
-/// elements of the x
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> coshsinh(const T1& x)
-{
- return intrinsics::coshsinh(x);
-}
-
-/// @brief Returns template expression that returns the hyperbolic cosine of the even elements of the x and
-/// the hyperbolic sine of the odd elements of the x
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::coshsinh, E1> coshsinh(E1&& x)
-{
- return { fn::coshsinh(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/impl/abs.hpp b/include/kfr/base/impl/abs.hpp
@@ -1,126 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../function.hpp"
-#include "../operators.hpp"
-#include "../select.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
-{
- return x & constants<T>::invhighbitmask();
-}
-
-KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); }
-KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); }
-KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); }
-KFR_SINTRIN i8sse abs(const i8sse& x) { return _mm_abs_epi8(*x); }
-KFR_SINTRIN u64sse abs(const u64sse& x) { return x; }
-KFR_SINTRIN u32sse abs(const u32sse& x) { return x; }
-KFR_SINTRIN u16sse abs(const u16sse& x) { return x; }
-KFR_SINTRIN u8sse abs(const u8sse& x) { return x; }
-
-#if defined CMT_ARCH_AVX2
-KFR_SINTRIN i64avx abs(const i64avx& x) { return select(x >= 0, x, -x); }
-KFR_SINTRIN i32avx abs(const i32avx& x) { return _mm256_abs_epi32(*x); }
-KFR_SINTRIN i16avx abs(const i16avx& x) { return _mm256_abs_epi16(*x); }
-KFR_SINTRIN i8avx abs(const i8avx& x) { return _mm256_abs_epi8(*x); }
-KFR_SINTRIN u64avx abs(const u64avx& x) { return x; }
-KFR_SINTRIN u32avx abs(const u32avx& x) { return x; }
-KFR_SINTRIN u16avx abs(const u16avx& x) { return x; }
-KFR_SINTRIN u8avx abs(const u8avx& x) { return x; }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_SINTRIN i64avx512 abs(const i64avx512& x) { return select(x >= 0, x, -x); }
-KFR_SINTRIN i32avx512 abs(const i32avx512& x) { return _mm512_abs_epi32(*x); }
-KFR_SINTRIN i16avx512 abs(const i16avx512& x) { return _mm512_abs_epi16(*x); }
-KFR_SINTRIN i8avx512 abs(const i8avx512& x) { return _mm512_abs_epi8(*x); }
-KFR_SINTRIN u64avx512 abs(const u64avx512& x) { return x; }
-KFR_SINTRIN u32avx512 abs(const u32avx512& x) { return x; }
-KFR_SINTRIN u16avx512 abs(const u16avx512& x) { return x; }
-KFR_SINTRIN u8avx512 abs(const u8avx512& x) { return x; }
-#endif
-
-KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); }
-KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); }
-KFR_SINTRIN i32neon abs(const i32neon& x) { return vabsq_s32(*x); }
-#if defined CMT_ARCH_NEON64
-KFR_SINTRIN i64neon abs(const i64neon& x) { return vabsq_s64(*x); }
-#else
-KFR_SINTRIN i64neon abs(const i64neon& x) { return select(x >= 0, x, -x); }
-#endif
-
-KFR_SINTRIN u8neon abs(const u8neon& x) { return x; }
-KFR_SINTRIN u16neon abs(const u16neon& x) { return x; }
-KFR_SINTRIN u32neon abs(const u32neon& x) { return x; }
-KFR_SINTRIN u64neon abs(const u64neon& x) { return x; }
-
-KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); }
-#if defined CMT_ARCH_NEON64
-KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); }
-#else
-KFR_SINTRIN f64neon abs(const f64neon& x) { return x & constants<f64>::invhighbitmask(); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_1(abs)
-
-#else
-
-// floating point
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
-{
- return x & constants<T>::invhighbitmask();
-}
-
-// fallback
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
-{
- return select(x >= T(0), x, -x);
-}
-#endif
-KFR_I_CONVERTER(abs)
-} // namespace intrinsics
-
-KFR_I_FN(abs)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/asin_acos.hpp b/include/kfr/base/impl/asin_acos.hpp
@@ -1,58 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../atan.hpp"
-#include "../function.hpp"
-#include "../select.hpp"
-#include "../sqrt.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x)
-{
- const vec<Tout, N> xx = x;
- return atan2(xx, sqrt(Tout(1) - xx * xx));
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x)
-{
- const vec<Tout, N> xx = x;
- return -atan2(xx, sqrt(Tout(1) - xx * xx)) + constants<Tout>::pi * 0.5;
-}
-KFR_I_FLT_CONVERTER(asin)
-KFR_I_FLT_CONVERTER(acos)
-} // namespace intrinsics
-KFR_I_FN(asin)
-KFR_I_FN(acos)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/atan.hpp b/include/kfr/base/impl/atan.hpp
@@ -1,229 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "../abs.hpp"
-#include "../constants.hpp"
-#include "../function.hpp"
-#include "../operators.hpp"
-#include "../select.hpp"
-#include "../sin_cos.hpp"
-
-namespace kfr
-{
-namespace intrinsics
-{
-template <size_t N>
-KFR_SINTRIN vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx)
-{
- vec<f32, N> x = xx, y = yy;
- vec<f32, N> s, t, u;
- vec<i32, N> q;
- q = select(x < 0, -2, 0);
- x = select(x < 0, -x, x);
- mask<i32, N> m;
- m = y > x;
- t = x;
- x = select(m, y, x);
- y = select(m, -t, y);
- q = select(m, q + 1, q);
- s = y / x;
- t = s * s;
- u = 0.00282363896258175373077393f;
- u = fmadd(u, t, -0.0159569028764963150024414f);
- u = fmadd(u, t, 0.0425049886107444763183594f);
- u = fmadd(u, t, -0.0748900920152664184570312f);
- u = fmadd(u, t, 0.106347933411598205566406f);
- u = fmadd(u, t, -0.142027363181114196777344f);
- u = fmadd(u, t, 0.199926957488059997558594f);
- u = fmadd(u, t, -0.333331018686294555664062f);
- t = u * t * s + s;
- t = cast<f32>(q) * 1.5707963267948966192313216916398f + t;
- return t;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx)
-{
- vec<f64, N> x = xx, y = yy;
- vec<f64, N> s, t, u;
- vec<i64, N> q;
- q = select(x < 0, i64(-2), i64(0));
- x = select(x < 0, -x, x);
- mask<i64, N> m;
- m = y > x;
- t = x;
- x = select(m, y, x);
- y = select(m, -t, y);
- q = select(m, q + i64(1), q);
- s = y / x;
- t = s * s;
- u = -1.88796008463073496563746e-05;
- u = fmadd(u, t, 0.000209850076645816976906797);
- u = fmadd(u, t, -0.00110611831486672482563471);
- u = fmadd(u, t, 0.00370026744188713119232403);
- u = fmadd(u, t, -0.00889896195887655491740809);
- u = fmadd(u, t, 0.016599329773529201970117);
- u = fmadd(u, t, -0.0254517624932312641616861);
- u = fmadd(u, t, 0.0337852580001353069993897);
- u = fmadd(u, t, -0.0407629191276836500001934);
- u = fmadd(u, t, 0.0466667150077840625632675);
- u = fmadd(u, t, -0.0523674852303482457616113);
- u = fmadd(u, t, 0.0587666392926673580854313);
- u = fmadd(u, t, -0.0666573579361080525984562);
- u = fmadd(u, t, 0.0769219538311769618355029);
- u = fmadd(u, t, -0.090908995008245008229153);
- u = fmadd(u, t, 0.111111105648261418443745);
- u = fmadd(u, t, -0.14285714266771329383765);
- u = fmadd(u, t, 0.199999999996591265594148);
- u = fmadd(u, t, -0.333333333333311110369124);
- t = u * t * s + s;
- t = cast<f64>(q) * 1.5707963267948966192313216916398 + t;
- return t;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x)
-{
- vec<f32, N> r = atan2k(abs(y), x);
- constexpr f32 pi = 3.1415926535897932384626433832795f;
- constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f;
- constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f;
- r = mulsign(r, x);
- r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r);
- r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r);
- r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r);
- r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
- return r;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x)
-{
- vec<f64, N> r = atan2k(abs(y), x);
- constexpr f64 pi = 3.1415926535897932384626433832795;
- constexpr f64 pi_over_2 = 1.5707963267948966192313216916398;
- constexpr f64 pi_over_4 = 0.78539816339744830961566084581988;
- r = mulsign(r, x);
- r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r);
- r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r);
- r = select(y == 0.0, select(x < 0., pi, 0.), r);
- r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
- return r;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> atan(const vec<f32, N>& x)
-{
- vec<f32, N> t, u;
- vec<i32, N> q;
- q = select(x < 0.f, 2, 0);
- vec<f32, N> s = select(x < 0.f, -x, x);
- q = select(s > 1.f, q | 1, q);
- s = select(s > 1.f, 1.0f / s, s);
- t = s * s;
- u = 0.00282363896258175373077393f;
- u = fmadd(u, t, -0.0159569028764963150024414f);
- u = fmadd(u, t, 0.0425049886107444763183594f);
- u = fmadd(u, t, -0.0748900920152664184570312f);
- u = fmadd(u, t, 0.106347933411598205566406f);
- u = fmadd(u, t, -0.142027363181114196777344f);
- u = fmadd(u, t, 0.199926957488059997558594f);
- u = fmadd(u, t, -0.333331018686294555664062f);
- t = s + s * (t * u);
- t = select((q & 1) != 0, 1.570796326794896557998982f - t, t);
- t = select((q & 2) != 0, -t, t);
- return t;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> atan(const vec<f64, N>& x)
-{
- vec<f64, N> t, u;
- vec<i64, N> q;
- q = select(x < 0.0, i64(2), i64(0));
- vec<f64, N> s = select(x < 0.0, -x, x);
- q = select(s > 1.0, q | 1, q);
- s = select(s > 1.0, 1.0 / s, s);
- t = s * s;
- u = -1.88796008463073496563746e-05;
- u = fmadd(u, t, 0.000209850076645816976906797);
- u = fmadd(u, t, -0.00110611831486672482563471);
- u = fmadd(u, t, 0.00370026744188713119232403);
- u = fmadd(u, t, -0.00889896195887655491740809);
- u = fmadd(u, t, 0.016599329773529201970117);
- u = fmadd(u, t, -0.0254517624932312641616861);
- u = fmadd(u, t, 0.0337852580001353069993897);
- u = fmadd(u, t, -0.0407629191276836500001934);
- u = fmadd(u, t, 0.0466667150077840625632675);
- u = fmadd(u, t, -0.0523674852303482457616113);
- u = fmadd(u, t, 0.0587666392926673580854313);
- u = fmadd(u, t, -0.0666573579361080525984562);
- u = fmadd(u, t, 0.0769219538311769618355029);
- u = fmadd(u, t, -0.090908995008245008229153);
- u = fmadd(u, t, 0.111111105648261418443745);
- u = fmadd(u, t, -0.14285714266771329383765);
- u = fmadd(u, t, 0.199999999996591265594148);
- u = fmadd(u, t, -0.333333333333311110369124);
- t = s + s * (t * u);
- t = select((q & 1) != 0, 1.570796326794896557998982 - t, t);
- t = select((q & 2) != 0, -t, t);
- return t;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> atandeg(const vec<f32, N>& x)
-{
- return atan(x) * c_radtodeg<f32>;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> atandeg(const vec<f64, N>& x)
-{
- return atan(x) * c_radtodeg<f64>;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> atan2deg(const vec<f32, N>& y, const vec<f32, N>& x)
-{
- return atan2(y, x) * c_radtodeg<f32>;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> atan2deg(const vec<f64, N>& y, const vec<f64, N>& x)
-{
- return atan2(y, x) * c_radtodeg<f64>;
-}
-
-KFR_I_FLT_CONVERTER(atan)
-KFR_I_FLT_CONVERTER(atan2)
-KFR_I_FLT_CONVERTER(atandeg)
-KFR_I_FLT_CONVERTER(atan2deg)
-} // namespace intrinsics
-KFR_I_FN(atan)
-KFR_I_FN(atandeg)
-KFR_I_FN(atan2)
-KFR_I_FN(atan2deg)
-} // namespace kfr
diff --git a/include/kfr/base/impl/clamp.hpp b/include/kfr/base/impl/clamp.hpp
@@ -1,56 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../min_max.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <typename T>
-KFR_SINTRIN T clamp(const T& x, const T& lo, const T& hi)
-{
- return max(min(x, hi), lo);
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi)
-{
- return max(min(x, hi), lo);
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi)
-{
- return max(min(x, hi), zerovector<T, N>());
-}
-} // namespace intrinsics
-KFR_I_FN(clamp)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/gamma.hpp b/include/kfr/base/impl/gamma.hpp
@@ -1,72 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "../function.hpp"
-#include "../log_exp.hpp"
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-#if CMT_HAS_WARNING("-Wc99-extensions")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
-#endif
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-template <typename T>
-constexpr T gamma_precalc[] = {
- 0x2.81b263fec4e08p+0, 0x3.07b4100e04448p+16, -0xa.a0da01d4d4e2p+16, 0xf.05ccb27bb9dbp+16,
- -0xa.fa79616b7c6ep+16, 0x4.6dd6c10d4df5p+16, -0xf.a2304199eb4ap+12, 0x1.c21dd4aade3dp+12,
- -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0, -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24,
-};
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> gamma(const vec<T, N>& z)
-{
- constexpr size_t Count = arraysize(gamma_precalc<T>);
- vec<T, N> accm = gamma_precalc<T>[0];
- CMT_LOOP_UNROLL
- for (size_t k = 1; k < Count; k++)
- accm += gamma_precalc<T>[k] / (z + cast<utype<T>>(k));
- accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5);
- return accm / z;
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> factorial_approx(const vec<T, N>& x)
-{
- return gamma(x + T(1));
-}
-KFR_I_FLT_CONVERTER(gamma)
-KFR_I_FLT_CONVERTER(factorial_approx)
-} // namespace intrinsics
-KFR_I_FN(gamma)
-KFR_I_FN(factorial_approx)
-
-} // namespace kfr
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/impl/hyperbolic.hpp b/include/kfr/base/impl/hyperbolic.hpp
@@ -1,100 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../abs.hpp"
-#include "../constants.hpp"
-#include "../function.hpp"
-#include "../log_exp.hpp"
-#include "../min_max.hpp"
-#include "../operators.hpp"
-#include "../select.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sinh(const vec<T, N>& x)
-{
- const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x);
- return (exp(xx) - exp(-xx)) * Tout(0.5);
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> cosh(const vec<T, N>& x)
-{
- const vec<Tout, N> xx = static_cast<vec<Tout, N>>(x);
- return (exp(xx) + exp(-xx)) * Tout(0.5);
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> tanh(const vec<T, N>& x)
-{
- const vec<Tout, N> a = exp(2 * x);
- return (a - 1) / (a + 1);
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> coth(const vec<T, N>& x)
-{
- const vec<Tout, N> a = exp(2 * x);
- return (a + 1) / (a - 1);
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sinhcosh(const vec<T, N>& x)
-{
- const vec<Tout, N> a = exp(x);
- const vec<Tout, N> b = exp(-x);
- return subadd(a, b) * Tout(0.5);
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> coshsinh(const vec<T, N>& x)
-{
- const vec<Tout, N> a = exp(x);
- const vec<Tout, N> b = exp(-x);
- return addsub(a, b) * Tout(0.5);
-}
-
-KFR_I_FLT_CONVERTER(sinh)
-KFR_I_FLT_CONVERTER(cosh)
-KFR_I_FLT_CONVERTER(tanh)
-KFR_I_FLT_CONVERTER(coth)
-KFR_I_FLT_CONVERTER(sinhcosh)
-KFR_I_FLT_CONVERTER(coshsinh)
-} // namespace intrinsics
-KFR_I_FN(sinh)
-KFR_I_FN(cosh)
-KFR_I_FN(tanh)
-KFR_I_FN(coth)
-KFR_I_FN(sinhcosh)
-KFR_I_FN(coshsinh)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/log_exp.hpp b/include/kfr/base/impl/log_exp.hpp
@@ -1,315 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../abs.hpp"
-#include "../clamp.hpp"
-#include "../constants.hpp"
-#include "../function.hpp"
-#include "../min_max.hpp"
-#include "../operators.hpp"
-#include "../round.hpp"
-#include "../select.hpp"
-#include "../shuffle.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <size_t N>
-KFR_SINTRIN vec<i32, N> vilogbp1(const vec<f32, N>& d)
-{
- mask<i32, N> m = d < 5.421010862427522E-20f;
- vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff;
- q = select(m, q - (64 + 0x7e), q - 0x7e);
- return q;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<i64, N> vilogbp1(const vec<f64, N>& d)
-{
- mask<i64, N> m = d < 4.9090934652977266E-91;
- vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff;
- q = select(m, q - (300 + 0x03fe), q - 0x03fe);
- return q;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q)
-{
- vec<i32, N> m = q >> 31;
- m = (((m + q) >> 6) - m) << 4;
- const vec<i32, N> qq = q - (m << 2);
- m = clamp(m + 0x7f, vec<i32, N>(0xff));
- vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23));
- return x * u * bitcast<f32>((cast<i32>(qq + 0x7f)) << 23);
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q)
-{
- vec<i64, N> m = q >> 31;
- m = (((m + q) >> 9) - m) << 7;
- const vec<i64, N> qq = q - (m << 2);
- m = clamp(m + 0x3ff, i64(0x7ff));
- vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52));
- return x * u * bitcast<f64>((cast<i64>(qq + 0x3ff)) << 52);
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x)
-{
- return select(x == T(), -c_infinity<T>, static_cast<vec<T, N>>(vilogbp1(x) - 1));
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d)
-{
- vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f );
- vec<f32, N> m = vldexpk(d, -e);
-
- vec<f32, N> x = (m - 1.0f) / (m + 1.0f);
- vec<f32, N> x2 = x * x;
-
- vec<f32, N> sp = select(d < 0, constants<f32>::qnan, constants<f32>::neginfinity);
-
- vec<f32, N> t = 0.2371599674224853515625f;
- t = fmadd(t, x2, 0.285279005765914916992188f);
- t = fmadd(t, x2, 0.400005519390106201171875f);
- t = fmadd(t, x2, 0.666666567325592041015625f);
- t = fmadd(t, x2, 2.0f);
-
- x = x * t + c_log_2<f32> * cast<f32>(e);
- x = select(d > 0, x, sp);
-
- return x;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d)
-{
- vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 );
- vec<f64, N> m = vldexpk(d, -e);
-
- vec<f64, N> x = (m - 1.0) / (m + 1.0);
- vec<f64, N> x2 = x * x;
-
- vec<f64, N> sp = select(d < 0, constants<f64>::qnan, constants<f64>::neginfinity);
-
- vec<f64, N> t = 0.148197055177935105296783;
- t = fmadd(t, x2, 0.153108178020442575739679);
- t = fmadd(t, x2, 0.181837339521549679055568);
- t = fmadd(t, x2, 0.22222194152736701733275);
- t = fmadd(t, x2, 0.285714288030134544449368);
- t = fmadd(t, x2, 0.399999999989941956712869);
- t = fmadd(t, x2, 0.666666666666685503450651);
- t = fmadd(t, x2, 2);
-
- x = x * t + constants<f64>::log_2 * cast<f64>(e);
- x = select(d > 0, x, sp);
-
- return x;
-}
-
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x)
-{
- return log(cast<Tout>(x)) * constants<Tout>::recip_log_2;
-}
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x)
-{
- return log(cast<Tout>(x)) * constants<Tout>::recip_log_10;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d)
-{
- const f32 ln2_part1 = 0.6931457519f;
- const f32 ln2_part2 = 1.4286067653e-6f;
-
- vec<i32, N> q = cast<i32>(floor(d * constants<f32>::recip_log_2));
- vec<f32, N> s, u;
-
- s = fmadd(cast<f32>(q), -ln2_part1, d);
- s = fmadd(cast<f32>(q), -ln2_part2, s);
-
- const f32 c2 = 0.4999999105930328369140625f;
- const f32 c3 = 0.166668415069580078125f;
- const f32 c4 = 4.16539050638675689697265625e-2f;
- const f32 c5 = 8.378830738365650177001953125e-3f;
- const f32 c6 = 1.304379315115511417388916015625e-3f;
- const f32 c7 = 2.7555381529964506626129150390625e-4f;
-
- u = c7;
- u = fmadd(u, s, c6);
- u = fmadd(u, s, c5);
- u = fmadd(u, s, c4);
- u = fmadd(u, s, c3);
- u = fmadd(u, s, c2);
-
- u = s * s * u + s + 1.0f;
- u = vldexpk(u, q);
-
- u = select(d == constants<f32>::neginfinity, 0.f, u);
-
- return u;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d)
-{
- const f64 ln2_part1 = 0.69314717501401901245;
- const f64 ln2_part2 = 5.545926273775592108e-009;
-
- vec<i64, N> q = cast<i64>(floor(d * +constants<f64>::recip_log_2));
- vec<f64, N> s, u;
-
- s = fmadd(cast<f64>(q), -ln2_part1, d);
- s = fmadd(cast<f64>(q), -ln2_part2, s);
-
- const f64 c2 = 0.499999999999994948485237955537741072475910186767578;
- const f64 c3 = 0.166666666667024204739888659787538927048444747924805;
- const f64 c4 = 4.16666666578945840693215529881854308769106864929199e-2;
- const f64 c5 = 8.3333334397461874404333670440792047884315252304077e-3;
- const f64 c6 = 1.3888881489747750223179290074426717183087021112442e-3;
- const f64 c7 = 1.9841587032493949419205414574918222569976933300495e-4;
- const f64 c8 = 2.47929324077393282239802768662784160369483288377523e-5;
- const f64 c9 = 2.77076037925831049422552981864598109496000688523054e-6;
- const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7;
- const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8;
-
- u = c11;
- u = fmadd(u, s, c10);
- u = fmadd(u, s, c9);
- u = fmadd(u, s, c8);
- u = fmadd(u, s, c7);
- u = fmadd(u, s, c6);
- u = fmadd(u, s, c5);
- u = fmadd(u, s, c4);
- u = fmadd(u, s, c3);
- u = fmadd(u, s, c2);
-
- u = s * s * u + s + 1.0;
- u = vldexpk(u, q);
-
- u = select(d == constants<f64>::neginfinity, 0.0, u);
-
- return u;
-}
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> exp2(const vec<T, N>& x)
-{
- return exp(x * constants<Tout>::log_2);
-}
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x)
-{
- return exp(x * constants<Tout>::log_10);
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b)
-{
- const vec<T, N> t = exp(b * log(abs(a)));
- const mask<T, N> isint = floor(b) == b;
- const mask<T, N> iseven = (cast<itype<T>>(b) & 1) == 0;
- return select(
- a > T(), t,
- select(a == T(), T(), select(isint, select(iseven, t, -t), broadcast<N>(constants<T>::qnan))));
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b)
-{
- return exp(reciprocal(b) * log(x));
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cbrt(const vec<T, N>& x)
-{
- return pow<T, N>(x, T(0.333333333333333333333333333333333));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> cbrt(const vec<T, N>& x)
-{
- return cbrt(cast<Tout>(x));
-}
-
-KFR_I_FLT_CONVERTER(exp)
-KFR_I_FLT_CONVERTER(exp2)
-KFR_I_FLT_CONVERTER(exp10)
-KFR_I_FLT_CONVERTER(log)
-KFR_I_FLT_CONVERTER(log2)
-KFR_I_FLT_CONVERTER(log10)
-KFR_I_FLT_CONVERTER(logb)
-KFR_I_FLT_CONVERTER(pow)
-KFR_I_FLT_CONVERTER(root)
-KFR_I_FLT_CONVERTER(cbrt)
-
-template <typename T1, typename T2>
-KFR_SINTRIN flt_type<common_type<T1, T2>> logn(const T1& a, const T2& b)
-{
- return log(a) / log(b);
-}
-
-template <typename T1, typename T2>
-KFR_SINTRIN flt_type<common_type<T1, T2>> logm(const T1& a, const T2& b)
-{
- return log(a) * b;
-}
-
-template <typename T1, typename T2, typename T3>
-KFR_SINTRIN flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a)
-{
- return exp(fmadd(x, m, a));
-}
-
-template <typename T1, typename T2, typename T3>
-KFR_SINTRIN flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a)
-{
- return fmadd(log(x), m, a);
-}
-} // namespace intrinsics
-KFR_I_FN(exp)
-KFR_I_FN(exp2)
-KFR_I_FN(exp10)
-KFR_I_FN(log)
-KFR_I_FN(log2)
-KFR_I_FN(log10)
-KFR_I_FN(logb)
-KFR_I_FN(logn)
-KFR_I_FN(logm)
-KFR_I_FN(exp_fmadd)
-KFR_I_FN(log_fmadd)
-KFR_I_FN(pow)
-KFR_I_FN(root)
-KFR_I_FN(cbrt)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/logical.hpp b/include/kfr/base/impl/logical.hpp
@@ -1,289 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../abs.hpp"
-#include "../function.hpp"
-#include "../operators.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <size_t bits>
-struct bitmask
-{
- using type = conditional<(bits > 32), uint64_t,
- conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>;
-
- bitmask(type val) : value(val) {}
-
- template <typename Itype>
- bitmask(Itype val) : value(static_cast<type>(val))
- {
- }
-
- type value;
-};
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-#if defined CMT_ARCH_SSE41
-
-// horizontal OR
-KFR_SINTRIN bool bittestany(const u8sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const u16sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const u32sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const u64sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const i8sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const i16sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const i32sse& x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(const i64sse& x) { return !_mm_testz_si128(*x, *x); }
-
-// horizontal AND
-KFR_SINTRIN bool bittestall(const u8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const u16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const u32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const u64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-#endif
-
-#if defined CMT_ARCH_AVX
-// horizontal OR
-KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_ps(*x, *x); }
-KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_pd(*x, *x); }
-
-KFR_SINTRIN bool bittestany(const f32avx& x) { return !_mm256_testz_ps(*x, *x); }
-KFR_SINTRIN bool bittestany(const f64avx& x) { return !_mm256_testz_pd(*x, *x); }
-
-KFR_SINTRIN bool bittestany(const u8avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const u16avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const u32avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const u64avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const i8avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const i16avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const i32avx& x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(const i64avx& x) { return !_mm256_testz_si256(*x, *x); }
-
-// horizontal AND
-KFR_SINTRIN bool bittestall(const f32sse& x) { return _mm_testc_ps(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const f64sse& x) { return _mm_testc_pd(*x, *allonesvector(x)); }
-
-KFR_SINTRIN bool bittestall(const f32avx& x) { return _mm256_testc_ps(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const f64avx& x) { return _mm256_testc_pd(*x, *allonesvector(x)); }
-
-KFR_SINTRIN bool bittestall(const u8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const u16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const u32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const u64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-
-#if defined CMT_ARCH_AVX512
-// horizontal OR
-KFR_SINTRIN bool bittestany(const f32avx512& x) { return _mm512_test_epi32_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const f64avx512& x) { return _mm512_test_epi64_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u8avx512& x) { return _mm512_test_epi8_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u16avx512& x) { return _mm512_test_epi16_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u32avx512& x) { return _mm512_test_epi32_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u64avx512& x) { return _mm512_test_epi64_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i8avx512& x) { return _mm512_test_epi8_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i16avx512& x) { return _mm512_test_epi16_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i32avx512& x) { return _mm512_test_epi32_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i64avx512& x) { return _mm512_test_epi64_mask(*x, *x); }
-
-// horizontal AND
-KFR_SINTRIN bool bittestall(const f32avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const f64avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u8avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u16avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u32avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u64avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i8avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i16avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i32avx512& x) { return !bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i64avx512& x) { return !bittestany(~x); }
-
-#endif
-
-#elif defined CMT_ARCH_SSE41
-KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); }
-KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); }
-KFR_SINTRIN bool bittestall(const f32sse& x)
-{
- return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x)));
-}
-KFR_SINTRIN bool bittestall(const f64sse& x)
-{
- return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x)));
-}
-#endif
-
-#if !defined CMT_ARCH_SSE41
-
-KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); }
-KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); }
-KFR_SINTRIN bool bittestany(const u8sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const u16sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const u32sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const u64sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const i8sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const i16sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const i32sse& x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(const i64sse& x) { return _mm_movemask_epi8(*x); }
-
-KFR_SINTRIN bool bittestall(const f32sse& x) { return !_mm_movemask_ps(*~x); }
-KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); }
-KFR_SINTRIN bool bittestall(const u8sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const u16sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const u32sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const u64sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const i8sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const i16sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); }
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
-KFR_SINTRIN bool bittestall(const vec<T, N>& a)
-{
- return bittestall(expand_simd(a, internal::maskbits<T>(true)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
-KFR_SINTRIN bool bittestall(const vec<T, N>& a)
-{
- return bittestall(low(a)) && bittestall(high(a));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
-KFR_SINTRIN bool bittestany(const vec<T, N>& a)
-{
- return bittestany(expand_simd(a, internal::maskbits<T>(false)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
-KFR_SINTRIN bool bittestany(const vec<T, N>& a)
-{
- return bittestany(low(a)) || bittestany(high(a));
-}
-
-#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN bool bittestall(const u32neon& a)
-{
- const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a));
- return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
-}
-
-KFR_SINTRIN bool bittestany(const u32neon& a)
-{
- const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a));
- return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
-}
-KFR_SINTRIN bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); }
-
-KFR_SINTRIN bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); }
-KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); }
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
-KFR_SINTRIN bool bittestall(const vec<T, N>& a)
-{
- return bittestall(expand_simd(a, internal::maskbits<T>(true)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
-KFR_SINTRIN bool bittestall(const vec<T, N>& a)
-{
- return bittestall(low(a)) && bittestall(high(a));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
-KFR_SINTRIN bool bittestany(const vec<T, N>& a)
-{
- return bittestany(expand_simd(a, internal::maskbits<T>(false)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
-KFR_SINTRIN bool bittestany(const vec<T, N>& a)
-{
- return bittestany(low(a)) || bittestany(high(a));
-}
-
-#else
-
-template <typename T, size_t N>
-KFR_SINTRIN bitmask<N> getmask(const vec<T, N>& x)
-{
- typename bitmask<N>::type val = 0;
- for (size_t i = 0; i < N; i++)
- {
- val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i;
- }
- return val;
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN bool bittestany(const vec<T, N>& x)
-{
- return getmask(x).value;
-}
-template <typename T, size_t N>
-KFR_SINTRIN bool bittestany(const vec<T, N>& x, const vec<T, N>& y)
-{
- return bittestany(x & y);
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN bool bittestall(const vec<T, N>& x)
-{
- return !getmask(~x).value;
-}
-template <typename T, size_t N>
-KFR_SINTRIN bool bittestall(const vec<T, N>& x, const vec<T, N>& y)
-{
- return !bittestany(~x & y);
-}
-#endif
-} // namespace intrinsics
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/min_max.hpp b/include/kfr/base/impl/min_max.hpp
@@ -1,232 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../abs.hpp"
-#include "../function.hpp"
-#include "../operators.hpp"
-#include "../select.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); }
-KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); }
-KFR_SINTRIN u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(*x, *y); }
-KFR_SINTRIN i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(*x, *y); }
-
-KFR_SINTRIN f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(*x, *y); }
-KFR_SINTRIN f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(*x, *y); }
-KFR_SINTRIN u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(*x, *y); }
-KFR_SINTRIN i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(*x, *y); }
-
-#if defined CMT_ARCH_AVX2
-KFR_SINTRIN u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(*x, *y); }
-KFR_SINTRIN i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(*x, *y); }
-KFR_SINTRIN i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(*x, *y); }
-KFR_SINTRIN u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(*x, *y); }
-KFR_SINTRIN i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(*x, *y); }
-KFR_SINTRIN u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(*x, *y); }
-
-KFR_SINTRIN u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(*x, *y); }
-KFR_SINTRIN i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(*x, *y); }
-KFR_SINTRIN i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(*x, *y); }
-KFR_SINTRIN u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(*x, *y); }
-KFR_SINTRIN i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(*x, *y); }
-KFR_SINTRIN u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(*x, *y); }
-
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_SINTRIN u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(*x, *y); }
-KFR_SINTRIN i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(*x, *y); }
-KFR_SINTRIN i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(*x, *y); }
-KFR_SINTRIN u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(*x, *y); }
-KFR_SINTRIN i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(*x, *y); }
-KFR_SINTRIN u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(*x, *y); }
-KFR_SINTRIN u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(*x, *y); }
-KFR_SINTRIN i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(*x, *y); }
-KFR_SINTRIN i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(*x, *y); }
-KFR_SINTRIN u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(*x, *y); }
-KFR_SINTRIN i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(*x, *y); }
-KFR_SINTRIN u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(*x, *y); }
-KFR_SINTRIN i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(*x, *y); }
-KFR_SINTRIN u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(*x, *y); }
-KFR_SINTRIN i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(*x, *y); }
-KFR_SINTRIN u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(*x, *y); }
-
-KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(*x, *y); }
-KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(*x, *y); }
-KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(*x, *y); }
-KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(*x, *y); }
-
-KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(*x, *y); }
-KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(*x, *y); }
-KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(*x, *y); }
-KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(*x, *y); }
-#else
-KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
-KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
-KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
-KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
-KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
-KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
-KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
-KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
-#endif
-
-#if defined CMT_ARCH_AVX
-KFR_SINTRIN f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(*x, *y); }
-KFR_SINTRIN f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(*x, *y); }
-KFR_SINTRIN f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(*x, *y); }
-KFR_SINTRIN f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(*x, *y); }
-#endif
-
-#if defined CMT_ARCH_SSE41
-KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(*x, *y); }
-KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(*x, *y); }
-KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(*x, *y); }
-KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(*x, *y); }
-
-KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(*x, *y); }
-KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(*x, *y); }
-KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(*x, *y); }
-KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(*x, *y); }
-#else
-KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
-KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
-KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
-KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); }
-
-KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); }
-KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); }
-KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); }
-KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); }
-
-#endif
-
-KFR_HANDLE_ALL_SIZES_2(min)
-KFR_HANDLE_ALL_SIZES_2(max)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); }
-KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); }
-KFR_SINTRIN i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(*x, *y); }
-KFR_SINTRIN u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(*x, *y); }
-KFR_SINTRIN i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(*x, *y); }
-KFR_SINTRIN u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(*x, *y); }
-KFR_SINTRIN i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); }
-KFR_SINTRIN u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); }
-
-KFR_SINTRIN i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(*x, *y); }
-KFR_SINTRIN u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(*x, *y); }
-KFR_SINTRIN i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(*x, *y); }
-KFR_SINTRIN u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(*x, *y); }
-KFR_SINTRIN i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(*x, *y); }
-KFR_SINTRIN u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(*x, *y); }
-KFR_SINTRIN i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); }
-KFR_SINTRIN u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); }
-
-KFR_SINTRIN f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(*x, *y); }
-KFR_SINTRIN f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(*x, *y); }
-#if defined CMT_ARCH_NEON64
-KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(*x, *y); }
-KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(*x, *y); }
-#else
-KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); }
-KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_2(min)
-KFR_HANDLE_ALL_SIZES_2(max)
-
-#else
-
-// fallback
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y)
-{
- return select(x < y, x, y);
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y)
-{
- return select(x > y, x, y);
-}
-#endif
-
-template <typename T>
-KFR_SINTRIN T min(initialvalue<T>)
-{
- return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::max();
-}
-template <typename T>
-KFR_SINTRIN T max(initialvalue<T>)
-{
- return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::min();
-}
-template <typename T>
-KFR_SINTRIN T absmin(initialvalue<T>)
-{
- return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::max();
-}
-template <typename T>
-KFR_SINTRIN T absmax(initialvalue<T>)
-{
- return 0;
-}
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y)
-{
- return min(abs(x), abs(y));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y)
-{
- return max(abs(x), abs(y));
-}
-
-KFR_I_CONVERTER(min)
-KFR_I_CONVERTER(max)
-KFR_I_CONVERTER(absmin)
-KFR_I_CONVERTER(absmax)
-} // namespace intrinsics
-KFR_I_FN(min)
-KFR_I_FN(max)
-KFR_I_FN(absmin)
-KFR_I_FN(absmax)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/modzerobessel.hpp b/include/kfr/base/impl/modzerobessel.hpp
@@ -1,105 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../function.hpp"
-#include "../log_exp.hpp"
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-#if CMT_HAS_WARNING("-Wc99-extensions")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
-#endif
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> modzerobessel(const vec<T, N>& x)
-{
- constexpr static T bessel_coef[] = { T(0.25),
- T(0.027777777777777776236),
- T(0.0017361111111111110147),
- T(6.9444444444444444384e-005),
- T(1.9290123456790123911e-006),
- T(3.9367598891408417495e-008),
- T(6.1511873267825652335e-010),
- T(7.5940584281266239246e-012),
- T(7.5940584281266233693e-014),
- T(6.2760813455591932909e-016),
- T(4.3583898233049949985e-018),
- T(2.5789288895295827557e-020),
- T(1.3157800456783586208e-022),
- T(5.8479113141260384983e-025),
- T(2.2843403570804837884e-027),
- T(7.904291893012054025e-030),
- T(2.4395962632753252792e-032),
- T(6.75788438580422547e-035),
- T(1.689471096451056426e-037),
- T(3.8310002187098784929e-040),
- T(7.9152897080782616517e-043),
- T(1.4962740468957016443e-045),
- T(2.5976979980828152196e-048),
- T(4.1563167969325041577e-051),
- T(6.1483976285983795968e-054),
- T(8.434015951438105991e-057),
- T(1.0757673407446563809e-059),
- T(1.2791526049282476926e-062),
- T(1.4212806721424974034e-065),
- T(1.4789601166935457918e-068),
- T(1.4442969889585408123e-071),
- T(1.3262598613026086927e-074),
- T(1.1472836170437790782e-077),
- T(9.3655805472961564331e-081),
- T(7.2265282000741942594e-084),
- T(5.2786911614858977913e-087),
- T(3.6556032974279072401e-090),
- T(2.4034209713529963119e-093),
- T(1.5021381070956226783e-096) };
-
- const vec<T, N> x_2 = x * 0.5;
- const vec<T, N> x_2_sqr = x_2 * x_2;
- vec<T, N> num = x_2_sqr;
- vec<T, N> result;
- result = 1 + x_2_sqr;
-
- CMT_LOOP_UNROLL
- for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++)
- {
- result = fmadd((num *= x_2_sqr), bessel_coef[i], result);
- }
- return result;
-}
-
-KFR_I_CONVERTER(modzerobessel)
-} // namespace intrinsics
-KFR_I_FN(modzerobessel)
-
-} // namespace kfr
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/impl/round.hpp b/include/kfr/base/impl/round.hpp
@@ -1,255 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../function.hpp"
-#include "../operators.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_ss(V) \
- _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm_roundnearest_sd(V) \
- _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V))
-#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V))
-#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V))
-#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V))
-
-#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#define KFR_mm512_trunc_ps(V) _mm512_roundscale_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm512_roundnearest_ps(V) _mm512_roundscale_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-#define KFR_mm512_trunc_pd(V) _mm512_roundscale_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
-#define KFR_mm512_roundnearest_pd(V) _mm512_roundscale_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
-
-#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); }
-KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); }
-KFR_SINTRIN f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(*value); }
-KFR_SINTRIN f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(*value); }
-KFR_SINTRIN f64sse floor(const f64sse& value) { return _mm_floor_pd(*value); }
-KFR_SINTRIN f64sse ceil(const f64sse& value) { return _mm_ceil_pd(*value); }
-KFR_SINTRIN f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(*value); }
-KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*value); }
-KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); }
-KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); }
-
-#if defined CMT_ARCH_AVX
-
-KFR_SINTRIN f32avx floor(const f32avx& value) { return _mm256_floor_ps(*value); }
-KFR_SINTRIN f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(*value); }
-KFR_SINTRIN f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(*value); }
-KFR_SINTRIN f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(*value); }
-KFR_SINTRIN f64avx floor(const f64avx& value) { return _mm256_floor_pd(*value); }
-KFR_SINTRIN f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(*value); }
-KFR_SINTRIN f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(*value); }
-KFR_SINTRIN f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(*value); }
-KFR_SINTRIN f32avx fract(const f32avx& x) { return x - floor(x); }
-KFR_SINTRIN f64avx fract(const f64avx& x) { return x - floor(x); }
-#endif
-
-#if defined CMT_ARCH_AVX512
-
-KFR_SINTRIN f32avx512 floor(const f32avx512& value) { return _mm512_floor_ps(*value); }
-KFR_SINTRIN f32avx512 ceil(const f32avx512& value) { return _mm512_ceil_ps(*value); }
-KFR_SINTRIN f32avx512 trunc(const f32avx512& value) { return KFR_mm512_trunc_ps(*value); }
-KFR_SINTRIN f32avx512 round(const f32avx512& value) { return KFR_mm512_roundnearest_ps(*value); }
-KFR_SINTRIN f64avx512 floor(const f64avx512& value) { return _mm512_floor_pd(*value); }
-KFR_SINTRIN f64avx512 ceil(const f64avx512& value) { return _mm512_ceil_pd(*value); }
-KFR_SINTRIN f64avx512 trunc(const f64avx512& value) { return KFR_mm512_trunc_pd(*value); }
-KFR_SINTRIN f64avx512 round(const f64avx512& value) { return KFR_mm512_roundnearest_pd(*value); }
-KFR_SINTRIN f32avx512 fract(const f32avx512& x) { return x - floor(x); }
-KFR_SINTRIN f64avx512 fract(const f64avx512& x) { return x - floor(x); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_F_1(floor)
-KFR_HANDLE_ALL_SIZES_F_1(ceil)
-KFR_HANDLE_ALL_SIZES_F_1(round)
-KFR_HANDLE_ALL_SIZES_F_1(trunc)
-KFR_HANDLE_ALL_SIZES_F_1(fract)
-
-#else
-
-// fallback
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x)
-{
- vec<f32, N> t = cast<f32>(cast<i32>(x));
- return t - select(x < t, 1.f, 0.f);
-}
-template <size_t N>
-KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x)
-{
- vec<f64, N> t = cast<f64>(cast<i64>(x));
- return t - select(x < t, 1., 0.);
-}
-template <size_t N>
-KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x)
-{
- vec<f32, N> t = cast<f32>(cast<i32>(x));
- return t + select(x > t, 1.f, 0.f);
-}
-template <size_t N>
-KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x)
-{
- vec<f64, N> t = cast<f64>(cast<i64>(x));
- return t + select(x > t, 1., 0.);
-}
-template <size_t N>
-KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x)
-{
- return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x)));
-}
-template <size_t N>
-KFR_SINTRIN vec<f64, N> round(const vec<f64, N>& x)
-{
- return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x)));
-}
-template <size_t N>
-KFR_SINTRIN vec<f32, N> trunc(const vec<f32, N>& x)
-{
- return cast<f32>(cast<i32>(x));
-}
-template <size_t N>
-KFR_SINTRIN vec<f64, N> trunc(const vec<f64, N>& x)
-{
- return cast<f64>(cast<i64>(x));
-}
-template <size_t N>
-KFR_SINTRIN vec<f32, N> fract(const vec<f32, N>& x)
-{
- return x - floor(x);
-}
-template <size_t N>
-KFR_SINTRIN vec<f64, N> fract(const vec<f64, N>& x)
-{
- return x - floor(x);
-}
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> floor(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> ceil(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> trunc(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> round(const vec<T, N>& value)
-{
- return value;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fract(const vec<T, N>&)
-{
- return T(0);
-}
-
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> ifloor(const vec<T, N>& value)
-{
- return cast<IT>(floor(value));
-}
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> iceil(const vec<T, N>& value)
-{
- return cast<IT>(ceil(value));
-}
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> itrunc(const vec<T, N>& value)
-{
- return cast<IT>(trunc(value));
-}
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> iround(const vec<T, N>& value)
-{
- return cast<IT>(round(value));
-}
-
-KFR_I_CONVERTER(floor)
-KFR_I_CONVERTER(ceil)
-KFR_I_CONVERTER(round)
-KFR_I_CONVERTER(trunc)
-KFR_I_CONVERTER(fract)
-KFR_I_CONVERTER(ifloor)
-KFR_I_CONVERTER(iceil)
-KFR_I_CONVERTER(iround)
-KFR_I_CONVERTER(itrunc)
-} // namespace intrinsics
-KFR_I_FN(floor)
-KFR_I_FN(ceil)
-KFR_I_FN(round)
-KFR_I_FN(trunc)
-KFR_I_FN(fract)
-KFR_I_FN(ifloor)
-KFR_I_FN(iceil)
-KFR_I_FN(iround)
-KFR_I_FN(itrunc)
-
-} // namespace kfr
-
-#undef KFR_mm_trunc_ps
-#undef KFR_mm_roundnearest_ps
-#undef KFR_mm_trunc_pd
-#undef KFR_mm_roundnearest_pd
-#undef KFR_mm_trunc_ss
-#undef KFR_mm_roundnearest_ss
-#undef KFR_mm_trunc_sd
-#undef KFR_mm_roundnearest_sd
-#undef KFR_mm_floor_ss
-#undef KFR_mm_floor_sd
-#undef KFR_mm_ceil_ss
-#undef KFR_mm_ceil_sd
-#undef KFR_mm256_trunc_ps
-#undef KFR_mm256_roundnearest_ps
-#undef KFR_mm256_trunc_pd
-#undef KFR_mm256_roundnearest_pd
diff --git a/include/kfr/base/impl/saturation.hpp b/include/kfr/base/impl/saturation.hpp
@@ -1,192 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../function.hpp"
-#include "../select.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-// Generic functions
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
-{
- using UT = utype<T>;
- constexpr size_t shift = typebits<UT>::bits - 1;
- vec<UT, N> aa = bitcast<UT>(a);
- vec<UT, N> bb = bitcast<UT>(b);
- const vec<UT, N> sum = aa + bb;
- aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
-
- return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= 0, a, bitcast<T>(sum));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
-{
- using UT = utype<T>;
- constexpr size_t shift = typebits<UT>::bits - 1;
- vec<UT, N> aa = bitcast<UT>(a);
- vec<UT, N> bb = bitcast<UT>(b);
- const vec<UT, N> diff = aa - bb;
- aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
-
- return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < 0, a, bitcast<T>(diff));
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
-{
- const vec<T, N> t = allonesvector(a);
- return select(a > t - b, t, a + b);
-}
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return select(a < b, zerovector(a), a - b);
-}
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); }
-KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); }
-KFR_SINTRIN u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(*x, *y); }
-KFR_SINTRIN i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(*x, *y); }
-
-KFR_SINTRIN u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(*x, *y); }
-KFR_SINTRIN i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(*x, *y); }
-KFR_SINTRIN u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(*x, *y); }
-KFR_SINTRIN i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(*x, *y); }
-
-KFR_SINTRIN i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
-
-KFR_SINTRIN i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
-KFR_SINTRIN u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
-
-#if defined CMT_ARCH_AVX2
-KFR_SINTRIN u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(*x, *y); }
-KFR_SINTRIN i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(*x, *y); }
-KFR_SINTRIN u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(*x, *y); }
-KFR_SINTRIN i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(*x, *y); }
-
-KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(*x, *y); }
-KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); }
-KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); }
-KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); }
-
-KFR_SINTRIN i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); }
-
-KFR_SINTRIN i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); }
-KFR_SINTRIN u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_SINTRIN u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(*x, *y); }
-KFR_SINTRIN i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(*x, *y); }
-KFR_SINTRIN u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(*x, *y); }
-KFR_SINTRIN i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(*x, *y); }
-KFR_SINTRIN u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(*x, *y); }
-KFR_SINTRIN i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(*x, *y); }
-KFR_SINTRIN u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(*x, *y); }
-KFR_SINTRIN i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(*x, *y); }
-
-KFR_SINTRIN i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN u32avx512 satadd(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN u64avx512 satadd(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN u32avx512 satsub(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_sub(a, b); }
-KFR_SINTRIN u64avx512 satsub(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_sub(a, b); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_2(satadd)
-KFR_HANDLE_ALL_SIZES_2(satsub)
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); }
-KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); }
-KFR_SINTRIN u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(*x, *y); }
-KFR_SINTRIN i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(*x, *y); }
-KFR_SINTRIN u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(*a, *b); }
-KFR_SINTRIN i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(*a, *b); }
-KFR_SINTRIN u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(*a, *b); }
-KFR_SINTRIN i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(*a, *b); }
-
-KFR_SINTRIN u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(*x, *y); }
-KFR_SINTRIN i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(*x, *y); }
-KFR_SINTRIN u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(*x, *y); }
-KFR_SINTRIN i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(*x, *y); }
-KFR_SINTRIN u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(*a, *b); }
-KFR_SINTRIN i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(*a, *b); }
-KFR_SINTRIN u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(*a, *b); }
-KFR_SINTRIN i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(*a, *b); }
-
-KFR_HANDLE_ALL_SIZES_2(satadd)
-KFR_HANDLE_ALL_SIZES_2(satsub)
-
-#else
-// fallback
-template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_signed_add(a, b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_unsigned_add(a, b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_signed_sub(a, b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return saturated_unsigned_sub(a, b);
-}
-#endif
-KFR_I_CONVERTER(satadd)
-KFR_I_CONVERTER(satsub)
-} // namespace intrinsics
-KFR_I_FN(satadd)
-KFR_I_FN(satsub)
-} // namespace kfr
diff --git a/include/kfr/base/impl/select.hpp b/include/kfr/base/impl/select.hpp
@@ -1,261 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../function.hpp"
-
-namespace kfr
-{
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN u8sse select(const maskfor<u8sse>& m, const u8sse& x, const u8sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN u16sse select(const maskfor<u16sse>& m, const u16sse& x, const u16sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN u32sse select(const maskfor<u32sse>& m, const u32sse& x, const u32sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN u64sse select(const maskfor<u64sse>& m, const u64sse& x, const u64sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i8sse select(const maskfor<i8sse>& m, const i8sse& x, const i8sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i16sse select(const maskfor<i16sse>& m, const i16sse& x, const i16sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i32sse select(const maskfor<i32sse>& m, const i32sse& x, const i32sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i64sse select(const maskfor<i64sse>& m, const i64sse& x, const i64sse& y)
-{
- return _mm_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN f32sse select(const maskfor<f32sse>& m, const f32sse& x, const f32sse& y)
-{
- return _mm_blendv_ps(*y, *x, *m);
-}
-KFR_SINTRIN f64sse select(const maskfor<f64sse>& m, const f64sse& x, const f64sse& y)
-{
- return _mm_blendv_pd(*y, *x, *m);
-}
-
-#if defined CMT_ARCH_AVX
-KFR_SINTRIN f64avx select(const maskfor<f64avx>& m, const f64avx& x, const f64avx& y)
-{
- return _mm256_blendv_pd(*y, *x, *m);
-}
-KFR_SINTRIN f32avx select(const maskfor<f32avx>& m, const f32avx& x, const f32avx& y)
-{
- return _mm256_blendv_ps(*y, *x, *m);
-}
-#endif
-
-#if defined CMT_ARCH_AVX2
-KFR_SINTRIN u8avx select(const maskfor<u8avx>& m, const u8avx& x, const u8avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN u16avx select(const maskfor<u16avx>& m, const u16avx& x, const u16avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN u32avx select(const maskfor<u32avx>& m, const u32avx& x, const u32avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN u64avx select(const maskfor<u64avx>& m, const u64avx& x, const u64avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i8avx select(const maskfor<i8avx>& m, const i8avx& x, const i8avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i16avx select(const maskfor<i16avx>& m, const i16avx& x, const i16avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i32avx select(const maskfor<i32avx>& m, const i32avx& x, const i32avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-KFR_SINTRIN i64avx select(const maskfor<i64avx>& m, const i64avx& x, const i64avx& y)
-{
- return _mm256_blendv_epi8(*y, *x, *m);
-}
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_SINTRIN f64avx512 select(const maskfor<f64avx512>& m, const f64avx512& x, const f64avx512& y)
-{
- return _mm512_mask_blend_pd(_mm512_test_epi64_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN f32avx512 select(const maskfor<f32avx512>& m, const f32avx512& x, const f32avx512& y)
-{
- return _mm512_mask_blend_ps(_mm512_test_epi32_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN u8avx512 select(const maskfor<u8avx512>& m, const u8avx512& x, const u8avx512& y)
-{
- return _mm512_mask_blend_epi8(_mm512_test_epi8_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN u16avx512 select(const maskfor<u16avx512>& m, const u16avx512& x, const u16avx512& y)
-{
- return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN u32avx512 select(const maskfor<u32avx512>& m, const u32avx512& x, const u32avx512& y)
-{
- return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN u64avx512 select(const maskfor<u64avx512>& m, const u64avx512& x, const u64avx512& y)
-{
- return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const i8avx512& y)
-{
- return _mm512_mask_blend_epi8(_mm512_test_epi8_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN i16avx512 select(const maskfor<i16avx512>& m, const i16avx512& x, const i16avx512& y)
-{
- return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN i32avx512 select(const maskfor<i32avx512>& m, const i32avx512& x, const i32avx512& y)
-{
- return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x);
-}
-KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, const i64avx512& y)
-{
- return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x);
-}
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
-KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
-KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- return concat(select(low(a.asvec()).asmask(), low(b), low(c)),
- select(high(a.asvec()).asmask(), high(b), high(c)));
-}
-
-#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN f32neon select(const maskfor<f32neon>& m, const f32neon& x, const f32neon& y)
-{
- return vbslq_f32(*m, *x, *y);
-}
-
-KFR_SINTRIN i8neon select(const maskfor<i8neon>& m, const i8neon& x, const i8neon& y)
-{
- return vbslq_s8(*m, *x, *y);
-}
-KFR_SINTRIN u8neon select(const maskfor<u8neon>& m, const u8neon& x, const u8neon& y)
-{
- return vbslq_u8(*m, *x, *y);
-}
-KFR_SINTRIN i16neon select(const maskfor<i16neon>& m, const i16neon& x, const i16neon& y)
-{
- return vbslq_s16(*m, *x, *y);
-}
-KFR_SINTRIN u16neon select(const maskfor<u16neon>& m, const u16neon& x, const u16neon& y)
-{
- return vbslq_u16(*m, *x, *y);
-}
-KFR_SINTRIN i32neon select(const maskfor<i32neon>& m, const i32neon& x, const i32neon& y)
-{
- return vbslq_s32(*m, *x, *y);
-}
-KFR_SINTRIN u32neon select(const maskfor<u32neon>& m, const u32neon& x, const u32neon& y)
-{
- return vbslq_u32(*m, *x, *y);
-}
-KFR_SINTRIN i64neon select(const maskfor<i64neon>& m, const i64neon& x, const i64neon& y)
-{
- return vbslq_s64(*m, *x, *y);
-}
-KFR_SINTRIN u64neon select(const maskfor<u64neon>& m, const u64neon& x, const u64neon& y)
-{
- return vbslq_u64(*m, *x, *y);
-}
-
-#ifdef CMT_ARCH_NEON64
-KFR_SINTRIN f64neon select(const maskfor<f64neon>& m, const f64neon& x, const f64neon& y)
-{
- return vbslq_f64(*m, *x, *y);
-}
-#else
-KFR_SINTRIN f64neon select(const maskfor<f64neon>& m, const f64neon& x, const f64neon& y)
-{
- return y ^ ((x ^ y) & m.asvec());
-}
-#endif
-
-template <typename T, size_t N, KFR_ENABLE_IF(N < platform<T>::vector_width)>
-KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- return slice<0, N>(select(expand_simd(a.asvec()).asmask(), expand_simd(b), expand_simd(c)));
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= platform<T>::vector_width), typename = void>
-KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
-{
- return concat(select(low(a.asvec()).asmask(), low(b), low(c)),
- select(high(a.asvec()).asmask(), high(b), high(c)));
-}
-
-#else
-
-// fallback
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
-{
- return y ^ ((x ^ y) & m.asvec());
-}
-#endif
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
-{
- return select(m.asmask(), x, y);
-}
-} // namespace intrinsics
-KFR_I_FN(select)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/sin_cos.hpp b/include/kfr/base/impl/sin_cos.hpp
@@ -1,338 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../abs.hpp"
-#include "../constants.hpp"
-#include "../function.hpp"
-#include "../min_max.hpp"
-#include "../operators.hpp"
-#include "../round.hpp"
-#include "../select.hpp"
-#include "../shuffle.hpp"
-
-#if CMT_HAS_WARNING("-Wc99-extensions")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
-#endif
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0)
-{
- return select(msk, a0, b0);
-}
-
-template <typename T, size_t N, typename... Ts>
-KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0,
- const T& a1, const T& b1, const Ts&... values)
-{
- return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0));
-}
-
-template <typename T, size_t N, typename Tprecise = f64>
-KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
-{
- const vec<T, N> xabs = abs(x);
- constexpr T div = constants<T>::fold_constant_div;
- vec<T, N> y = floor(xabs / div);
- quadrant = cast<itype<T>>(y - floor(y * T(1.0 / 16.0)) * T(16.0));
-
- const mask<T, N> msk = (quadrant & 1) != 0;
- quadrant = kfr::select(msk, quadrant + 1, quadrant);
- y = select(msk, y + T(1.0), y);
- quadrant = quadrant & 7;
-
- constexpr Tprecise hi = cast<Tprecise>(constants<T>::fold_constant_hi);
- constexpr T rem1 = constants<T>::fold_constant_rem1;
- constexpr T rem2 = constants<T>::fold_constant_rem2;
- return cast<T>(cast<Tprecise>(xabs) - cast<Tprecise>(y) * hi) - y * rem1 - y * rem2;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask)
-{
- constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, -1.6666667163e-01f);
- constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f);
- constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f);
- constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f);
- constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f);
- constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f);
- constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f);
- constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f);
- constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f);
- constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f);
-
- const vec<f32, N> x2 = folded * folded;
-
- vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6,
- cos_c8, sin_c8, cos_c10, sin_c10);
-
- formula = select(cosmask, formula, formula * folded);
- return formula;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask)
-{
- constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01);
- constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03);
- constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04);
- constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06);
- constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08);
- constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10);
- constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13);
- constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14);
- constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01);
- constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02);
- constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03);
- constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 2.480158730125666056e-05);
- constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07);
- constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09);
- constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11);
- constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14);
-
- vec<f64, N> x2 = folded * folded;
- vec<f64, N> formula =
- trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8,
- cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16);
-
- formula = select(cosmask, formula, formula * folded);
- return formula;
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
-KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
-{
- vec<itype<T>, N> quadrant;
- vec<T, N> folded = trig_fold(x_full, quadrant);
-
- mask<T, N> flip_sign =
- kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask();
-
- mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
- usecos = usecos ^ cosmask;
-
- vec<T, N> formula = trig_sincos(folded, usecos);
-
- mask<T, N> negmask = x_full < 0;
-
- flip_sign = flip_sign ^ (negmask & ~cosmask);
-
- formula = select(flip_sign, -formula, formula);
- return formula;
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x)
-{
- vec<itype<T>, N> quadrant;
- vec<T, N> folded = trig_fold(x, quadrant);
-
- mask<T, N> flip_sign = quadrant >= 4;
- mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
-
- vec<T, N> formula = trig_sincos(folded, usecos);
-
- formula = select(flip_sign ^ mask<T, N>(x), -formula, formula);
- return formula;
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x)
-{
- vec<itype<T>, N> quadrant;
- vec<T, N> folded = trig_fold(x, quadrant);
-
- mask<T, N> eq4 = (quadrant == 4);
- mask<T, N> flip_sign = (quadrant == 2) || eq4;
- mask<T, N> usecos = (quadrant == 0) || eq4;
-
- vec<T, N> formula = trig_sincos(folded, usecos);
-
- formula = select(flip_sign, -formula, formula);
- return formula;
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x)
-{
- const vec<T, N> msk = broadcast<N>(constants<T>::highbitmask());
-
- constexpr static T c2 = -0.16665853559970855712890625;
- constexpr static T c4 = +8.31427983939647674560546875e-3;
- constexpr static T c6 = -1.85423981747590005397796630859375e-4;
-
- const vec<T, N> pi = c_pi<T>;
-
- vec<T, N> xx = x - pi;
- vec<T, N> y = abs(xx);
- y = select(y > c_pi<T, 1, 2>, pi - y, y);
- y = y ^ (msk & ~xx);
-
- vec<T, N> y2 = y * y;
- vec<T, N> formula = c6;
- vec<T, N> y3 = y2 * y;
- formula = fmadd(formula, y2, c4);
- formula = fmadd(formula, y2, c2);
- formula = formula * y3 + y;
- return formula;
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fastcos(const vec<T, N>& x)
-{
- x += c_pi<T, 1, 2>;
- x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
- return fastsin(x);
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sincos(const vec<T, N>& x)
-{
- return sincos_mask(x, internal::oddmask<T, N>());
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x)
-{
- return sincos_mask(x, internal::evenmask<T, N>());
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x)
-{
- return select(abs(x) <= constants<T>::epsilon, T(1), sin(x) / x);
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sin(const vec<T, N>& x)
-{
- return sin(cast<Tout>(x));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> cos(const vec<T, N>& x)
-{
- return cos(cast<Tout>(x));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> fastsin(const vec<T, N>& x)
-{
- return fastsin(cast<Tout>(x));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> fastcos(const vec<T, N>& x)
-{
- return fastcos(cast<Tout>(x));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sincos(const vec<T, N>& x)
-{
- return sincos(cast<Tout>(x));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> cossin(const vec<T, N>& x)
-{
- return cossin(cast<Tout>(x));
-}
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sinc(const vec<T, N>& x)
-{
- return sinc(cast<Tout>(x));
-}
-
-KFR_I_FLT_CONVERTER(sin)
-KFR_I_FLT_CONVERTER(cos)
-KFR_I_FLT_CONVERTER(fastsin)
-KFR_I_FLT_CONVERTER(fastcos)
-KFR_I_FLT_CONVERTER(sincos)
-KFR_I_FLT_CONVERTER(cossin)
-KFR_I_FLT_CONVERTER(sinc)
-
-template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout sindeg(const T& x)
-{
- return sin(x * constants<Tout>::degtorad);
-}
-
-template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout cosdeg(const T& x)
-{
- return cos(x * constants<Tout>::degtorad);
-}
-
-template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout fastsindeg(const T& x)
-{
- return fastsin(x * constants<Tout>::degtorad);
-}
-
-template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout fastcosdeg(const T& x)
-{
- return fastcos(x * constants<Tout>::degtorad);
-}
-
-template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout sincosdeg(const T& x)
-{
- return sincos(x * constants<Tout>::degtorad);
-}
-
-template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout cossindeg(const T& x)
-{
- return cossin(x * constants<Tout>::degtorad);
-}
-} // namespace intrinsics
-
-KFR_I_FN(sin)
-KFR_I_FN(cos)
-KFR_I_FN(fastsin)
-KFR_I_FN(fastcos)
-KFR_I_FN(sincos)
-KFR_I_FN(cossin)
-
-KFR_I_FN(sindeg)
-KFR_I_FN(cosdeg)
-KFR_I_FN(fastsindeg)
-KFR_I_FN(fastcosdeg)
-KFR_I_FN(sincosdeg)
-KFR_I_FN(cossindeg)
-
-KFR_I_FN(sinc)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/sqrt.hpp b/include/kfr/base/impl/sqrt.hpp
@@ -1,71 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../function.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
-
-KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(f32x4(_mm_sqrt_ss(*extend<4>(x)))); }
-KFR_SINTRIN f64x1 sqrt(const f64x1& x)
-{
- return slice<0, 1>(f64x2(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x))));
-}
-KFR_SINTRIN f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(*x); }
-KFR_SINTRIN f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(*x); }
-
-#if defined CMT_ARCH_AVX
-KFR_SINTRIN f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(*x); }
-KFR_SINTRIN f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(*x); }
-#endif
-
-#if defined CMT_ARCH_AVX512
-KFR_SINTRIN f32avx512 sqrt(const f32avx512& x) { return _mm512_sqrt_ps(*x); }
-KFR_SINTRIN f64avx512 sqrt(const f64avx512& x) { return _mm512_sqrt_pd(*x); }
-#endif
-
-KFR_HANDLE_ALL_SIZES_FLT_1(sqrt)
-
-#else
-
-// fallback
-template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sqrt(const vec<T, N>& x)
-{
- return apply([](Tout x) { return std::sqrt(x); }, cast<Tout>(x));
-}
-#endif
-KFR_I_FLT_CONVERTER(sqrt)
-} // namespace intrinsics
-KFR_I_FN(sqrt)
-
-} // namespace kfr
diff --git a/include/kfr/base/impl/tan.hpp b/include/kfr/base/impl/tan.hpp
@@ -1,141 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../abs.hpp"
-#include "../constants.hpp"
-#include "../function.hpp"
-#include "../operators.hpp"
-#include "../select.hpp"
-#include "../sin_cos.hpp"
-
-namespace kfr
-{
-
-namespace intrinsics
-{
-
-template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse)
-{
- constexpr T pi_14 = c_pi<T, 1, 4>;
-
- vec<T, N> y = abs(x_full);
- vec<T, N> scaled = y / pi_14;
-
- vec<T, N> k_real = floor(scaled);
- vec<IT, N> k = cast<IT>(k_real);
-
- vec<T, N> x = y - k_real * pi_14;
-
- mask<T, N> need_offset = (k & 1) != 0;
- x = select(need_offset, x - pi_14, x);
-
- vec<IT, N> k_mod4 = k & 3;
- inverse = (k_mod4 == 1) || (k_mod4 == 2);
- return x;
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full)
-{
- mask<f32, N> inverse;
- vec<i32, N> quad;
- const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
- inverse = quad == 2 || quad == 6;
-
- constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01);
- constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01);
- constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02);
- constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02);
- constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03);
- constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03);
-
- constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01);
- constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02);
- constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03);
- constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04);
- constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05);
- constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, -4.153305894760705996e-07);
-
- const vec<f32, N> x2 = x * x;
- const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6,
- tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12);
-
- const vec<f32, N> z = select(inverse, val / -x, val * x);
- return mulsign(z, x_full);
-}
-
-template <size_t N>
-KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full)
-{
- mask<f64, N> inverse;
- vec<i64, N> quad;
- const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
- inverse = quad == 2 || quad == 6;
-
- constexpr f64 tan_c2 = CMT_FP(0x5.5555554d8e5b8p-4, 3.333333332201594557e-01);
- constexpr f64 tan_c4 = CMT_FP(0x2.222224820264p-4, 1.333333421790934281e-01);
- constexpr f64 tan_c6 = CMT_FP(0xd.d0d90de32b3e8p-8, 5.396801556632355862e-02);
- constexpr f64 tan_c8 = CMT_FP(0x5.99723bdcf5cacp-8, 2.187265359403693307e-02);
- constexpr f64 tan_c10 = CMT_FP(0x2.434a142e413ap-8, 8.839254309582239566e-03);
- constexpr f64 tan_c12 = CMT_FP(0xf.2b59061305efp-12, 3.703449009834865711e-03);
- constexpr f64 tan_c14 = CMT_FP(0x4.a12565071a664p-12, 1.130243370829653185e-03);
- constexpr f64 tan_c16 = CMT_FP(0x4.dada3797ac1bcp-12, 1.185276423238536747e-03);
- constexpr f64 tan_c18 = CMT_FP(-0x1.a74976b6ea3f3p-12, -4.036779095551438937e-04);
- constexpr f64 tan_c20 = CMT_FP(0x1.d06a5ae5e4a74p-12, 4.429010863244216712e-04);
-
- constexpr f64 cot_c2 = CMT_FP(-0x5.5555555555554p-4, -3.333333333333333148e-01);
- constexpr f64 cot_c4 = CMT_FP(-0x5.b05b05b05b758p-8, -2.222222222222377391e-02);
- constexpr f64 cot_c6 = CMT_FP(-0x8.ab355dffc79a8p-12, -2.116402116358796163e-03);
- constexpr f64 cot_c8 = CMT_FP(-0xd.debbca405c9f8p-16, -2.116402122295888289e-04);
- constexpr f64 cot_c10 = CMT_FP(-0x1.66a8edb99b15p-16, -2.137779458737224013e-05);
- constexpr f64 cot_c12 = CMT_FP(-0x2.450239be0ee92p-20, -2.164426049513111728e-06);
- constexpr f64 cot_c14 = CMT_FP(-0x3.ad6ddb4719438p-24, -2.191935496317727080e-07);
- constexpr f64 cot_c16 = CMT_FP(-0x5.ff4c42741356p-28, -2.234152473099993830e-08);
- constexpr f64 cot_c18 = CMT_FP(-0x9.06881bcdf3108p-32, -2.101416316020595077e-09);
- constexpr f64 cot_c20 = CMT_FP(-0x1.644abedc113cap-32, -3.240456633529511097e-10);
-
- const vec<f64, N> x2 = x * x;
- const vec<f64, N> val = trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6,
- cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14,
- cot_c16, tan_c16, cot_c18, tan_c18, cot_c20, tan_c20);
-
- const vec<f64, N> z = select(inverse, val / -x, val * x);
- return mulsign(z, x_full);
-}
-
-KFR_I_FLT_CONVERTER(tan)
-template <typename T>
-KFR_SINTRIN flt_type<T> tandeg(const T& x)
-{
- return tan(x * c_degtorad<flt_type<T>>);
-}
-} // namespace intrinsics
-KFR_I_FN(tan)
-KFR_I_FN(tandeg)
-
-} // namespace kfr
diff --git a/include/kfr/base/intrinsics.h b/include/kfr/base/intrinsics.h
@@ -1,18 +0,0 @@
-#pragma once
-
-#include "kfr.h"
-
-#ifdef CMT_ARCH_SSE2
-#include <immintrin.h>
-#ifdef CMT_OS_WIN
-#include <intrin.h>
-#endif
-#endif
-
-#ifdef CMT_ARCH_NEON
-#include <arm_neon.h>
-#endif
-
-#if defined CMT_COMPILER_GCC && defined CMT_ARCH_X86
-#include <x86intrin.h>
-#endif
diff --git a/include/kfr/base/kfr.h b/include/kfr/base/kfr.h
@@ -1,46 +0,0 @@
-/** @addtogroup utility
- * @{
- */
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "../cident.h"
-
-#define KFR_VERSION_MAJOR 3
-#define KFR_VERSION_MINOR 0
-#define KFR_VERSION_BUILD 4
-#define KFR_VERSION_STRING \
- CMT_STRINGIFY(KFR_VERSION_MAJOR) "." CMT_STRINGIFY(KFR_VERSION_MINOR) "." CMT_STRINGIFY(KFR_VERSION_BUILD)
-#define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_BUILD)
-
-#ifdef CMT_ARCH_X64
-#define KFR_VERSION_FULL \
- "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 64-bit (" CMT_COMPIER_NAME "/" CMT_OS_NAME \
- ")"
-#else
-#define KFR_VERSION_FULL \
- "KFR " KFR_VERSION_STRING " " CMT_STRINGIFY(CMT_ARCH_NAME) " 32-bit (" CMT_COMPIER_NAME "/" CMT_OS_NAME \
- ")"
-#endif
-
-#ifdef __cplusplus
-namespace kfr
-{
-/// @brief KFR version string
-constexpr const char version_string[] = KFR_VERSION_STRING;
-
-constexpr int version_major = KFR_VERSION_MAJOR;
-constexpr int version_minor = KFR_VERSION_MINOR;
-constexpr int version_build = KFR_VERSION_BUILD;
-constexpr int version = KFR_VERSION;
-
-/// @brief KFR version string including architecture and compiler name
-constexpr const char version_full[] = KFR_VERSION_FULL;
-} // namespace kfr
-#endif
-
-#define KFR_INTRIN CMT_INTRIN
-#define KFR_FUNC CMT_FUNC
-#define KFR_SINTRIN CMT_INTRIN static
diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp
@@ -1,229 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/log_exp.hpp"
-
-namespace kfr
-{
-
-/// @brief Returns e raised to the given power x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> exp(const T1& x)
-{
- return intrinsics::exp(x);
-}
-
-/// @brief Returns e raised to the given power x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::exp, E1> exp(E1&& x)
-{
- return { fn::exp(), std::forward<E1>(x) };
-}
-
-/// @brief Returns 2 raised to the given power x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> exp2(const T1& x)
-{
- return intrinsics::exp2(x);
-}
-
-/// @brief Returns 2 raised to the given power x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::exp2, E1> exp2(E1&& x)
-{
- return { fn::exp2(), std::forward<E1>(x) };
-}
-
-/// @brief Returns 10 raised to the given power x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> exp10(const T1& x)
-{
- return intrinsics::exp10(x);
-}
-
-/// @brief Returns 10 raised to the given power x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::exp10, E1> exp10(E1&& x)
-{
- return { fn::exp10(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the natural logarithm of the x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> log(const T1& x)
-{
- return intrinsics::log(x);
-}
-
-/// @brief Returns the natural logarithm of the x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::log, E1> log(E1&& x)
-{
- return { fn::log(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the binary (base-2) logarithm of the x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> log2(const T1& x)
-{
- return intrinsics::log2(x);
-}
-
-/// @brief Returns the binary (base-2) logarithm of the x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::log2, E1> log2(E1&& x)
-{
- return { fn::log2(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the common (base-10) logarithm of the x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> log10(const T1& x)
-{
- return intrinsics::log10(x);
-}
-
-/// @brief Returns the common (base-10) logarithm of the x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::log10, E1> log10(E1&& x)
-{
- return { fn::log10(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the rounded binary (base-2) logarithm of the x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> logb(const T1& x)
-{
- return intrinsics::logb(x);
-}
-
-/// @brief Returns the rounded binary (base-2) logarithm of the x. Version that accepts and returns
-/// expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::logb, E1> logb(E1&& x)
-{
- return { fn::logb(), std::forward<E1>(x) };
-}
-
-/// @brief Returns the logarithm of the x with base y.
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_FUNC flt_type<common_type<T1, T2>> logn(const T1& x, const T2& y)
-{
- return intrinsics::logn(x, y);
-}
-
-/// @brief Returns the logarithm of the x with base y. Version that accepts and returns expressions.
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_FUNC internal::expression_function<fn::logn, E1, E2> logn(E1&& x, E2&& y)
-{
- return { fn::logn(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/// @brief Returns log(x) * y.
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_FUNC flt_type<common_type<T1, T2>> logm(const T1& x, const T2& y)
-{
- return intrinsics::logm(x, y);
-}
-
-/// @brief Returns log(x) * y. Version that accepts and returns expressions.
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_FUNC internal::expression_function<fn::logm, E1, E2> logm(E1&& x, E2&& y)
-{
- return { fn::logm(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/// @brief Returns exp(x * m + a).
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_FUNC flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& y, const T3& z)
-{
- return intrinsics::exp_fmadd(x, y, z);
-}
-
-/// @brief Returns exp(x * m + a). Version that accepts and returns expressions.
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_FUNC internal::expression_function<fn::exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z)
-{
- return { fn::exp_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
-}
-
-/// @brief Returns log(x) * m + a.
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_FUNC flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& y, const T3& z)
-{
- return intrinsics::log_fmadd(x, y, z);
-}
-
-/// @brief Returns log(x) * m + a. Version that accepts and returns expressions.
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_FUNC internal::expression_function<fn::log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z)
-{
- return { fn::log_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
-}
-
-/// @brief Returns the x raised to the given power y.
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_FUNC flt_type<common_type<T1, T2>> pow(const T1& x, const T2& y)
-{
- return intrinsics::pow(x, y);
-}
-
-/// @brief Returns the x raised to the given power y. Version that accepts and returns expressions.
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_FUNC internal::expression_function<fn::pow, E1, E2> pow(E1&& x, E2&& y)
-{
- return { fn::pow(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/// @brief Returns the real nth root of the x.
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
-KFR_FUNC flt_type<common_type<T1, T2>> root(const T1& x, const T2& y)
-{
- return intrinsics::root(x, y);
-}
-
-/// @brief Returns the real nth root of the x. Version that accepts and returns expressions.
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_FUNC internal::expression_function<fn::root, E1, E2> root(E1&& x, E2&& y)
-{
- return { fn::root(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/// @brief Returns the cube root of the x.
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> cbrt(const T1& x)
-{
- return intrinsics::cbrt(x);
-}
-
-/// @brief Returns the cube root of the x. Version that accepts and returns expressions.
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cbrt, E1> cbrt(E1&& x)
-{
- return { fn::cbrt(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -1,50 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/logical.hpp"
-
-namespace kfr
-{
-
-/**
- * @brief Returns x[0] && x[1] && ... && x[N-1]
- */
-template <typename T, size_t N>
-KFR_SINTRIN bool all(const mask<T, N>& x)
-{
- return intrinsics::bittestall(x.asvec());
-}
-
-/**
- * @brief Returns x[0] || x[1] || ... || x[N-1]
- */
-template <typename T, size_t N>
-KFR_SINTRIN bool any(const mask<T, N>& x)
-{
- return intrinsics::bittestany(x.asvec());
-}
-} // namespace kfr
diff --git a/include/kfr/base/memory.hpp b/include/kfr/base/memory.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup utility
+/** @addtogroup memory
* @{
*/
/*
@@ -25,8 +25,8 @@
*/
#pragma once
-#include "read_write.hpp"
-#include "types.hpp"
+#include "../simd/read_write.hpp"
+#include "../simd/types.hpp"
#include <algorithm>
#include <atomic>
#include <memory>
@@ -34,7 +34,7 @@
namespace kfr
{
-namespace internal
+namespace internal_generic
{
struct memory_statistics
@@ -51,6 +51,8 @@ inline memory_statistics& get_memory_statistics()
return ms;
}
+#pragma pack(push, 1)
+
struct mem_header
{
u8 offset;
@@ -60,13 +62,18 @@ struct mem_header
unsigned int references_uint;
size_t size;
- CMT_INLINE std::atomic_uint& references() { return reinterpret_cast<std::atomic_uint&>(references_uint); }
+ KFR_MEM_INTRINSIC std::atomic_uint& references()
+ {
+ return reinterpret_cast<std::atomic_uint&>(references_uint);
+ }
}
#ifdef CMT_GNU_ATTRIBUTES
__attribute__((__packed__))
#endif
;
+#pragma pack(pop)
+
inline mem_header* aligned_header(void* ptr) { return ptr_cast<mem_header>(ptr) - 1; }
inline size_t aligned_size(void* ptr) { return aligned_header(ptr)->size; }
@@ -103,58 +110,58 @@ inline void aligned_free(void* ptr)
}
inline void aligned_release(void* ptr) { aligned_free(ptr); }
-} // namespace internal
+} // namespace internal_generic
/// @brief Allocates aligned memory
template <typename T = void, size_t alignment = platform<>::native_cache_alignment>
-CMT_INLINE T* aligned_allocate(size_t size = 1)
+KFR_INTRINSIC T* aligned_allocate(size_t size = 1)
{
T* ptr = static_cast<T*>(CMT_ASSUME_ALIGNED(
- internal::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment),
+ internal_generic::aligned_malloc(std::max(alignment, size * details::elementsize<T>()), alignment),
alignment));
return ptr;
}
/// @brief Deallocates aligned memory
template <typename T = void>
-CMT_INLINE void aligned_deallocate(T* ptr)
+KFR_INTRINSIC void aligned_deallocate(T* ptr)
{
- return internal::aligned_free(ptr);
+ return internal_generic::aligned_free(ptr);
}
-namespace internal
+namespace internal_generic
{
template <typename T>
struct aligned_deleter
{
- CMT_INLINE void operator()(T* ptr) const { aligned_deallocate(ptr); }
+ KFR_MEM_INTRINSIC void operator()(T* ptr) const { aligned_deallocate(ptr); }
};
-} // namespace internal
+} // namespace internal_generic
template <typename T>
struct autofree
{
- CMT_INLINE autofree() {}
- explicit CMT_INLINE autofree(size_t size) : ptr(aligned_allocate<T>(size)) {}
+ KFR_MEM_INTRINSIC autofree() {}
+ explicit KFR_MEM_INTRINSIC autofree(size_t size) : ptr(aligned_allocate<T>(size)) {}
autofree(const autofree&) = delete;
autofree& operator=(const autofree&) = delete;
- autofree(autofree&&) noexcept = default;
- autofree& operator=(autofree&&) noexcept = default;
- CMT_INLINE T& operator[](size_t index) noexcept { return ptr[index]; }
- CMT_INLINE const T& operator[](size_t index) const noexcept { return ptr[index]; }
+ autofree(autofree&&) CMT_NOEXCEPT = default;
+ autofree& operator=(autofree&&) CMT_NOEXCEPT = default;
+ KFR_MEM_INTRINSIC T& operator[](size_t index) CMT_NOEXCEPT { return ptr[index]; }
+ KFR_MEM_INTRINSIC const T& operator[](size_t index) const CMT_NOEXCEPT { return ptr[index]; }
template <typename U = T>
- CMT_INLINE U* data() noexcept
+ KFR_MEM_INTRINSIC U* data() CMT_NOEXCEPT
{
return ptr_cast<U>(ptr.get());
}
template <typename U = T>
- CMT_INLINE const U* data() const noexcept
+ KFR_MEM_INTRINSIC const U* data() const CMT_NOEXCEPT
{
return ptr_cast<U>(ptr.get());
}
- std::unique_ptr<T[], internal::aligned_deleter<T>> ptr;
+ std::unique_ptr<T[], internal_generic::aligned_deleter<T>> ptr;
};
#ifdef KFR_USE_STD_ALLOCATION
@@ -181,14 +188,14 @@ struct allocator
{
using other = allocator<U>;
};
- constexpr allocator() noexcept = default;
- constexpr allocator(const allocator&) noexcept = default;
+ constexpr allocator() CMT_NOEXCEPT = default;
+ constexpr allocator(const allocator&) CMT_NOEXCEPT = default;
template <typename U>
- constexpr allocator(const allocator<U>&) noexcept
+ constexpr allocator(const allocator<U>&) CMT_NOEXCEPT
{
}
- pointer address(reference x) const noexcept { return std::addressof(x); }
- const_pointer address(const_reference x) const noexcept { return std::addressof(x); }
+ pointer address(reference x) const CMT_NOEXCEPT { return std::addressof(x); }
+ const_pointer address(const_reference x) const CMT_NOEXCEPT { return std::addressof(x); }
pointer allocate(size_type n, std::allocator<void>::const_pointer = 0) const
{
pointer result = aligned_allocate<value_type>(n);
@@ -211,12 +218,12 @@ struct allocator
};
template <typename T1, typename T2>
-constexpr inline bool operator==(const allocator<T1>&, const allocator<T2>&) noexcept
+constexpr inline bool operator==(const allocator<T1>&, const allocator<T2>&) CMT_NOEXCEPT
{
return true;
}
template <typename T1, typename T2>
-constexpr inline bool operator!=(const allocator<T1>&, const allocator<T2>&) noexcept
+constexpr inline bool operator!=(const allocator<T1>&, const allocator<T2>&) CMT_NOEXCEPT
{
return false;
}
@@ -243,4 +250,5 @@ public:
\
private: \
mutable std::atomic_uintptr_t m_refcount = ATOMIC_VAR_INIT(0);
+
} // namespace kfr
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -1,107 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/min_max.hpp"
-
-namespace kfr
-{
-/**
- * @brief Returns the smaller of two values.
- */
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout min(const T1& x, const T2& y)
-{
- return intrinsics::min(x, y);
-}
-
-/**
- * @brief Returns template expression that returns the smaller of two values.
- */
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::min, E1, E2> min(E1&& x, E2&& y)
-{
- return { fn::min(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/**
- * @brief Returns the greater of two values.
- */
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout max(const T1& x, const T2& y)
-{
- return intrinsics::max(x, y);
-}
-
-/**
- * @brief Returns template expression that returns the greater of two values.
- */
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::max, E1, E2> max(E1&& x, E2&& y)
-{
- return { fn::max(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/**
- * @brief Returns the smaller in magnitude of two values.
- */
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout absmin(const T1& x, const T2& y)
-{
- return intrinsics::absmin(x, y);
-}
-
-/**
- * @brief Returns template expression that returns the smaller in magnitude of two values.
- */
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::absmin, E1, E2> absmin(E1&& x, E2&& y)
-{
- return { fn::absmin(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/**
- * @brief Returns the greater in magnitude of two values.
- */
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout absmax(const T1& x, const T2& y)
-{
- return intrinsics::absmax(x, y);
-}
-
-/**
- * @brief Returns template expression that returns the greater in magnitude of two values.
- */
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::absmax, E1, E2> absmax(E1&& x, E2&& y)
-{
- return { fn::absmax(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp
@@ -1,44 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/modzerobessel.hpp"
-
-namespace kfr
-{
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 modzerobessel(const T1& x)
-{
- return intrinsics::modzerobessel(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1&& x)
-{
- return { fn::modzerobessel(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp
@@ -1,552 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "bitwise.hpp"
-#include "function.hpp"
-#include <algorithm>
-#include <utility>
-
-namespace kfr
-{
-
-template <typename T>
-constexpr inline T add(const T& x)
-{
- return x;
-}
-
-/**
- * @brief Returns sum of all the arguments passed to a function.
- */
-template <typename T1, typename T2, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, T2, Ts...>::value)>
-constexpr inline common_type<T1, T2, Ts...> add(const T1& x, const T2& y, const Ts&... rest)
-{
- return x + add(y, rest...);
-}
-template <typename T>
-constexpr inline T add(initialvalue<T>)
-{
- return T(0);
-}
-KFR_FN(add)
-
-/**
- * @brief Returns template expression that returns sum of all the arguments passed to a function.
- */
-template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)>
-CMT_INLINE internal::expression_function<fn::add, E...> add(E&&... x)
-{
- return { fn::add(), std::forward<E>(x)... };
-}
-
-template <typename T1, typename T2>
-constexpr inline common_type<T1, T2> sub(const T1& x, const T2& y)
-{
- return x - y;
-}
-template <typename T>
-constexpr inline T sub(initialvalue<T>)
-{
- return T(0);
-}
-KFR_FN(sub)
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::sub, E1, E2> sub(E1&& x, E2&& y)
-{
- return { fn::sub(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-template <typename T1>
-constexpr inline T1 mul(const T1& x)
-{
- return x;
-}
-
-/**
- * @brief Returns product of all the arguments passed to a function.
- */
-template <typename T1, typename T2, typename... Ts>
-constexpr inline common_type<T1, T2, Ts...> mul(const T1& x, const T2& y, const Ts&... rest)
-{
- return x * mul(y, rest...);
-}
-
-template <typename T>
-constexpr inline T mul(initialvalue<T>)
-{
- return T(1);
-}
-KFR_FN(mul)
-
-/**
- * @brief Returns template expression that returns product of all the arguments passed to a function.
- */
-template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
-CMT_INLINE internal::expression_function<fn::mul, E...> mul(E&&... x)
-{
- return { fn::mul(), std::forward<E>(x)... };
-}
-
-/**
- * @brief Returns square of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-constexpr inline T1 sqr(const T1& x)
-{
- return x * x;
-}
-KFR_FN(sqr)
-
-/**
- * @brief Returns template expression that returns square of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::sqr, E1> sqr(E1&& x)
-{
- return { fn::sqr(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns cube of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-constexpr inline T1 cub(const T1& x)
-{
- return sqr(x) * x;
-}
-KFR_FN(cub)
-
-/**
- * @brief Returns template expression that returns cube of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::cub, E1> cub(E1&& x)
-{
- return { fn::cub(), std::forward<E1>(x) };
-}
-
-template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
-constexpr CMT_INLINE T pow2(const T& x)
-{
- return sqr(x);
-}
-
-template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
-constexpr CMT_INLINE T pow3(const T& x)
-{
- return cub(x);
-}
-
-template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
-constexpr CMT_INLINE T pow4(const T& x)
-{
- return sqr(sqr(x));
-}
-
-template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
-constexpr CMT_INLINE T pow5(const T& x)
-{
- return pow4(x) * x;
-}
-KFR_FN(pow2)
-KFR_FN(pow3)
-KFR_FN(pow4)
-KFR_FN(pow5)
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::pow2, E1> pow2(E1&& x)
-{
- return { fn::pow2(), std::forward<E1>(x) };
-}
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::pow3, E1> pow3(E1&& x)
-{
- return { fn::pow3(), std::forward<E1>(x) };
-}
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::pow4, E1> pow4(E1&& x)
-{
- return { fn::pow4(), std::forward<E1>(x) };
-}
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::pow5, E1> pow5(E1&& x)
-{
- return { fn::pow5(), std::forward<E1>(x) };
-}
-
-/// Raise x to the power base \f$ x^{base} \f$
-/// @code
-/// CHECK( ipow( 10, 3 ) == 1000 );
-/// CHECK( ipow( 0.5, 2 ) == 0.25 );
-/// @endcode
-template <typename T>
-constexpr inline T ipow(const T& x, int base)
-{
- T xx = x;
- T result = T(1);
- while (base)
- {
- if (base & 1)
- result *= xx;
- base >>= 1;
- xx *= xx;
- }
- return result;
-}
-KFR_FN(ipow)
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::ipow, E1, E2> ipow(E1&& x, E2&& b)
-{
- return { fn::ipow(), std::forward<E1>(x), std::forward<E2>(b) };
-}
-
-/// Return square of the sum of all arguments
-/// @code
-/// CHECK(sqrsum(1,2,3) == 36);
-/// @endcode
-template <typename T1, typename... Ts>
-constexpr inline common_type<T1, Ts...> sqrsum(const T1& x, const Ts&... rest)
-{
- return sqr(add(x, rest...));
-}
-
-template <typename T1, typename T2>
-constexpr inline common_type<T1, T2> sqrdiff(const T1& x, const T2& y)
-{
- return sqr(x - y);
-}
-KFR_FN(sqrsum)
-KFR_FN(sqrdiff)
-
-/// Division
-template <typename T1, typename T2, typename Tout = common_type<T1, T2>>
-CMT_INLINE Tout div(const T1& x, const T2& y)
-{
- return static_cast<Tout>(x) / static_cast<Tout>(y);
-}
-KFR_FN(div)
-
-/// Remainder
-template <typename T1, typename T2, typename Tout = common_type<T1, T2>>
-CMT_INLINE Tout rem(const T1& x, const T2& y)
-{
- return static_cast<Tout>(x) % static_cast<Tout>(y);
-}
-KFR_FN(rem)
-
-/// Negation
-template <typename T1>
-inline T1 neg(const T1& x)
-{
- return -x;
-}
-KFR_FN(neg)
-
-/// @brief Fused Multiply-Add
-template <typename T1, typename T2, typename T3>
-KFR_INTRIN constexpr common_type<T1, T2, T3> fmadd(const T1& x, const T2& y, const T3& z)
-{
- return x * y + z;
-}
-/// @brief Fused Multiply-Sub
-template <typename T1, typename T2, typename T3>
-KFR_INTRIN constexpr common_type<T1, T2, T3> fmsub(const T1& x, const T2& y, const T3& z)
-{
- return x * y - z;
-}
-KFR_FN(fmadd)
-KFR_FN(fmsub)
-
-/// @brief Linear blend of `x` and `y` (`c` must be in the range 0...+1)
-/// Returns `x + ( y - x ) * c`
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INTRIN constexpr common_type<T1, T2, T3> mix(const T1& c, const T2& x, const T3& y)
-{
- return fmadd(c, y - x, x);
-}
-
-/// @brief Linear blend of `x` and `y` (`c` must be in the range -1...+1)
-template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
-KFR_INTRIN constexpr common_type<T1, T2, T3> mixs(const T1& c, const T2& x, const T3& y)
-{
- return mix(fmadd(c, 0.5, 0.5), x, y);
-}
-KFR_FN(mix)
-KFR_FN(mixs)
-
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-CMT_INLINE internal::expression_function<fn::mix, E1, E2, E3> mix(E1&& c, E2&& x, E3&& y)
-{
- return { fn::mix(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) };
-}
-
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-CMT_INLINE internal::expression_function<fn::mixs, E1, E2, E3> mixs(E1&& c, E2&& x, E3&& y)
-{
- return { fn::mixs(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) };
-}
-
-namespace internal
-{
-
-template <typename T1, typename T2>
-constexpr CMT_INLINE common_type<T1, T2> horner(const T1&, const T2& c0)
-{
- return c0;
-}
-
-template <typename T1, typename T2, typename T3, typename... Ts>
-constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner(const T1& x, const T2& c0, const T3& c1,
- const Ts&... values)
-{
- return fmadd(horner(x, c1, values...), x, c0);
-}
-
-template <typename T1, typename T2>
-constexpr CMT_INLINE common_type<T1, T2> horner_even(const T1&, const T2& c0)
-{
- return c0;
-}
-
-template <typename T1, typename T2, typename T3, typename... Ts>
-constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner_even(const T1& x, const T2& c0, const T3& c2,
- const Ts&... values)
-{
- const T1 x2 = x * x;
- return fmadd(horner(x2, c2, values...), x2, c0);
-}
-
-template <typename T1, typename T2>
-constexpr CMT_INLINE common_type<T1, T2> horner_odd(const T1& x, const T2& c1)
-{
- return c1 * x;
-}
-
-template <typename T1, typename T2, typename T3, typename... Ts>
-constexpr CMT_INLINE common_type<T1, T2, T3, Ts...> horner_odd(const T1& x, const T2& c1, const T3& c3,
- const Ts&... values)
-{
- const T1 x2 = x * x;
- return fmadd(horner(x2, c3, values...), x2, c1) * x;
-}
-} // namespace internal
-
-/// @brief Calculate polynomial using Horner's method
-///
-/// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\)
-template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)>
-constexpr CMT_INLINE common_type<T1, Ts...> horner(const T1& x, const Ts&... c)
-{
- return internal::horner(x, c...);
-}
-KFR_FN(horner)
-
-template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
-CMT_INLINE internal::expression_function<fn::horner, E...> horner(E&&... x)
-{
- return { fn::horner(), std::forward<E>(x)... };
-}
-
-/// @brief Calculate polynomial using Horner's method (even powers)
-///
-/// ``horner_even(x, 1, 2, 3)`` is equivalent to \(3x^4 + 2x^2 + 1\)
-template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)>
-constexpr CMT_INLINE common_type<T1, Ts...> horner_even(const T1& x, const Ts&... c)
-{
- return internal::horner_even(x, c...);
-}
-KFR_FN(horner_even)
-
-template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
-CMT_INLINE internal::expression_function<fn::horner_even, E...> horner_even(E&&... x)
-{
- return { fn::horner_even(), std::forward<E>(x)... };
-}
-
-/// @brief Calculate polynomial using Horner's method (odd powers)
-///
-/// ``horner_odd(x, 1, 2, 3)`` is equivalent to \(3x^5 + 2x^3 + 1x\)
-template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)>
-constexpr CMT_INLINE common_type<T1, Ts...> horner_odd(const T1& x, const Ts&... c)
-{
- return internal::horner_odd(x, c...);
-}
-KFR_FN(horner_odd)
-
-template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
-CMT_INLINE internal::expression_function<fn::horner_odd, E...> horner_odd(E&&... x)
-{
- return { fn::horner_odd(), std::forward<E>(x)... };
-}
-
-/// @brief Calculate Multiplicative Inverse of `x`
-/// Returns `1/x`
-template <typename T>
-constexpr CMT_INLINE T reciprocal(const T& x)
-{
- static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type");
- return subtype<T>(1) / x;
-}
-KFR_FN(reciprocal)
-
-template <typename T1, typename T2>
-CMT_INLINE common_type<T1, T2> mulsign(const T1& x, const T2& y)
-{
- return bitwisexor(x, bitwiseand(y, constants<T2>::highbitmask()));
-}
-KFR_FN(mulsign)
-
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y)
-{
- return (x & constants<T>::highbitmask()) | (y & constants<T>::highbitmask());
-}
-
-/// @brief Swap byte order
-template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)>
-CMT_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x)
-{
- return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x)));
-}
-template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)>
-CMT_INLINE T swapbyteorder(const T& x)
-{
- return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x)));
-}
-template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)>
-CMT_INLINE T swapbyteorder(const T& x)
-{
- return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x)));
-}
-template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)>
-CMT_INLINE T swapbyteorder(const T& x)
-{
- return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x)));
-}
-KFR_FN(swapbyteorder)
-
-template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b)
-{
- return blend<1, 0>(a + b, a - b);
-}
-template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b)
-{
- return blend<0, 1>(a + b, a - b);
-}
-KFR_FN(subadd)
-KFR_FN(addsub)
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> negeven(const vec<T, N>& x)
-{
- return x ^ broadcast<N>(-T(), T());
-}
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> negodd(const vec<T, N>& x)
-{
- return x ^ broadcast<N>(T(), -T());
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::neg, E1> operator-(E1&& e1)
-{
- return { fn::neg(), std::forward<E1>(e1) };
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_INLINE internal::expression_function<fn::bitwisenot, E1> operator~(E1&& e1)
-{
- return { fn::bitwisenot(), std::forward<E1>(e1) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::add, E1, E2> operator+(E1&& e1, E2&& e2)
-{
- return { fn::add(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::sub, E1, E2> operator-(E1&& e1, E2&& e2)
-{
- return { fn::sub(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::mul, E1, E2> operator*(E1&& e1, E2&& e2)
-{
- return { fn::mul(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::div, E1, E2> operator/(E1&& e1, E2&& e2)
-{
- return { fn::div(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::bitwiseand, E1, E2> operator&(E1&& e1, E2&& e2)
-{
- return { fn::bitwiseand(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::bitwiseor, E1, E2> operator|(E1&& e1, E2&& e2)
-{
- return { fn::bitwiseor(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::bitwisexor, E1, E2> operator^(E1&& e1, E2&& e2)
-{
- return { fn::bitwisexor(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::shl, E1, E2> operator<<(E1&& e1, E2&& e2)
-{
- return { fn::shl(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::shr, E1, E2> operator>>(E1&& e1, E2&& e2)
-{
- return { fn::shr(), std::forward<E1>(e1), std::forward<E2>(e2) };
-}
-
-template <typename T, size_t N1, size_t... Ns>
-vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest)
-{
- const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...));
- return compcast<vec<T, sizeof...(Ns) + 1>>(t);
-}
-
-KFR_FN(packtranspose)
-} // namespace kfr
diff --git a/include/kfr/base/platform.hpp b/include/kfr/base/platform.hpp
@@ -1,186 +0,0 @@
-/** @addtogroup types
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "types.hpp"
-
-namespace kfr
-{
-
-/// @brief An enumeration representing cpu instruction set
-enum class cpu_t : int
-{
- common = 0,
-#ifdef CMT_ARCH_X86
- sse2 = 1,
- sse3 = 2,
- ssse3 = 3,
- sse41 = 4,
- sse42 = 5,
- avx1 = 6,
- avx2 = 7,
- avx512 = 8, // F, CD, VL, DQ and BW
- avx = static_cast<int>(avx1),
- lowest = static_cast<int>(sse2),
- highest = static_cast<int>(avx512),
-#endif
-#ifdef CMT_ARCH_ARM
- neon = 1,
- neon64 = 2,
- lowest = static_cast<int>(neon),
- highest = static_cast<int>(neon64),
-#endif
- native = static_cast<int>(CMT_ARCH_NAME),
- runtime = -1,
-};
-
-#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native
-
-template <cpu_t cpu>
-using ccpu_t = cval_t<cpu_t, cpu>;
-
-template <cpu_t cpu>
-constexpr ccpu_t<cpu> ccpu{};
-
-namespace internal
-{
-constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); }
-constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); }
-
-#ifdef CMT_ARCH_X86
-constexpr auto cpu_list = cvals_t<cpu_t, cpu_t::avx512, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3,
- cpu_t::sse3, cpu_t::sse2>();
-#else
-constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>;
-#endif
-} // namespace internal
-
-template <cpu_t cpu>
-using cpuval_t = cval_t<cpu_t, cpu>;
-template <cpu_t cpu>
-constexpr auto cpuval = cpuval_t<cpu>{};
-
-constexpr auto cpu_all = cfilter(internal::cpu_list, internal::cpu_list >= cpuval_t<cpu_t::native>());
-
-/// @brief Returns name of the cpu instruction set
-CMT_UNUSED static const char* cpu_name(cpu_t set)
-{
-#ifdef CMT_ARCH_X86
- static const char* names[] = { "common", "sse2", "sse3", "ssse3", "sse41",
- "sse42", "avx1", "avx2", "avx512" };
-#endif
-#ifdef CMT_ARCH_ARM
- static const char* names[] = { "common", "neon", "neon64" };
-#endif
- if (set >= cpu_t::lowest && set <= cpu_t::highest)
- return names[static_cast<size_t>(set)];
- return "-";
-}
-
-#ifdef CMT_ARCH_X64
-template <int = 0>
-constexpr inline const char* bitness_const(const char*, const char* x64)
-{
- return x64;
-}
-template <typename T>
-constexpr inline const T& bitness_const(const T&, const T& x64)
-{
- return x64;
-}
-#else
-template <int = 0>
-constexpr inline const char* bitness_const(const char* x32, const char*)
-{
- return x32;
-}
-template <typename T>
-constexpr inline const T& bitness_const(const T& x32, const T&)
-{
- return x32;
-}
-#endif
-
-template <typename T = i32, cpu_t c = cpu_t::native>
-struct platform
-{
- constexpr static size_t native_cache_alignment = 64;
- constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1;
- constexpr static size_t maximum_vector_alignment = 32;
- constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1;
-#ifdef CMT_ARCH_X86
- constexpr static size_t simd_register_count =
- c >= cpu_t::avx512 ? bitness_const(8, 32) : bitness_const(8, 16);
-#endif
-#ifdef CMT_ARCH_ARM
- constexpr static size_t simd_register_count = 16;
-#endif
-
- constexpr static size_t common_float_vector_size = 16;
- constexpr static size_t common_int_vector_size = 16;
-
-#ifdef CMT_ARCH_X86
- constexpr static size_t native_float_vector_size =
- c >= cpu_t::avx512 ? 64 : c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size;
-#endif
-#ifdef CMT_ARCH_ARM
- constexpr static size_t native_float_vector_size = c == cpu_t::neon ? 16 : common_float_vector_size;
-#endif
-#ifdef CMT_ARCH_X86
- constexpr static size_t native_int_vector_size =
- c >= cpu_t::avx512 ? 64 : c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size;
-#endif
-#ifdef CMT_ARCH_ARM
- constexpr static size_t native_int_vector_size = c == cpu_t::neon ? 16 : common_int_vector_size;
-#endif
-
- /// @brief SIMD vector width for the given cpu instruction set
- constexpr static size_t vector_width =
- (const_max(size_t(1), typeclass<T> == datatype::f ? native_float_vector_size / sizeof(T)
- : native_int_vector_size / sizeof(T)));
-
- constexpr static size_t vector_capacity = simd_register_count * vector_width;
-
- constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_capacity / 4);
-
- constexpr static size_t native_vector_alignment =
- const_max(native_float_vector_size, native_int_vector_size);
-
- constexpr static bool fast_unaligned =
-#ifdef CMT_ARCH_X86
- c >= cpu_t::avx1;
-#else
- false;
-#endif
-
- constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;
-};
-
-template <typename T, size_t N = platform<T>::vector_width>
-struct vec;
-template <typename T, size_t N = platform<T>::vector_width>
-struct mask;
-} // namespace kfr
diff --git a/include/kfr/base/pointer.hpp b/include/kfr/base/pointer.hpp
@@ -25,14 +25,17 @@
*/
#pragma once
+#include "../simd/vec.hpp"
#include "basic_expressions.hpp"
-#include "vec.hpp"
#include <memory>
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
-constexpr size_t maximum_expression_width = platform<float>::vector_capacity / 4;
+template <typename T>
+constexpr size_t maximum_expression_width = vector_width<T> * 2;
template <typename T, bool enable_resource = true>
struct expression_pointer;
@@ -41,11 +44,11 @@ namespace internal
{
template <typename Expression, typename T, size_t key = 0>
-KFR_SINTRIN bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer,
- csize_t<key> = {});
+KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer,
+ csize_t<key> = {});
}
-template <typename T, size_t N = maximum_expression_width>
+template <typename T, size_t N = maximum_expression_width<T>>
struct expression_vtable : expression_vtable<T, N / 2>
{
using func_get = void (*)(void*, size_t, vec<T, N>&);
@@ -60,7 +63,7 @@ struct expression_vtable : expression_vtable<T, N / 2>
template <typename Expression>
static void static_get(void* instance, size_t index, vec<T, N>& result)
{
- result = static_cast<Expression*>(instance)->operator()(cinput, index, vec_t<T, N>());
+ result = get_elements(*static_cast<Expression*>(instance), cinput, index, vec_shape<T, N>());
}
};
@@ -78,7 +81,7 @@ struct expression_vtable<T, 0>
func_substitute substitute;
template <typename Expression>
- expression_vtable(ctype_t<Expression> t)
+ expression_vtable(ctype_t<Expression>)
: size(&expression_vtable<T, 0>::template static_size<Expression>),
begin_block(&expression_vtable<T, 0>::template static_begin_block<Expression>),
end_block(&expression_vtable<T, 0>::template static_end_block<Expression>),
@@ -117,7 +120,7 @@ struct expression_resource
template <typename E>
struct expression_resource_impl : expression_resource
{
- expression_resource_impl(E&& e) noexcept : e(std::move(e)) {}
+ expression_resource_impl(E&& e) CMT_NOEXCEPT : e(std::move(e)) {}
virtual ~expression_resource_impl() {}
virtual void* instance() override final { return &e; }
@@ -126,7 +129,7 @@ private:
};
template <typename E>
-KFR_SINTRIN std::shared_ptr<expression_resource> make_resource(E&& e)
+KFR_INTRINSIC std::shared_ptr<expression_resource> make_resource(E&& e)
{
using T = expression_resource_impl<decay<E>>;
return std::static_pointer_cast<expression_resource>(
@@ -138,31 +141,35 @@ struct expression_pointer : input_expression
{
using value_type = T;
- expression_pointer() noexcept : instance(nullptr), vtable(nullptr) {}
+ expression_pointer() CMT_NOEXCEPT : instance(nullptr), vtable(nullptr) {}
expression_pointer(void* instance, const expression_vtable<T>* vtable,
std::shared_ptr<expression_resource> resource = nullptr)
: instance(instance), vtable(vtable), resource(std::move(resource))
{
}
- template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width)>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const
+ template <size_t N, KFR_ENABLE_IF(N <= maximum_expression_width<T>)>
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t, size_t index,
+ vec_shape<T, N>)
{
static_assert(is_poweroftwo(N), "N must be a power of two");
vec<T, N> result;
- static_cast<const expression_vtable<T, N>*>(vtable)->get(instance, index, result);
+ static_cast<const expression_vtable<T, N>*>(self.vtable)->get(self.instance, index, result);
return result;
}
- template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width)>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ template <size_t N, KFR_ENABLE_IF(N > maximum_expression_width<T>)>
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_pointer& self, cinput_t cinput,
+ size_t index, vec_shape<T, N>)
{
- return concat(operator()(cinput, index, vec_t<T, N / 2>()), operator()(cinput, index + N / 2,
- vec_t<T, N / 2>()));
+ static_assert(is_poweroftwo(N), "N must be a power of two");
+ const vec<T, N / 2> r1 = get_elements(self, cinput, index, vec_shape<T, N / 2>());
+ const vec<T, N / 2> r2 = get_elements(self, cinput, index + N / 2, vec_shape<T, N / 2>());
+ return concat(r1, r2);
}
- CMT_INLINE void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); }
- CMT_INLINE void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); }
- CMT_INLINE size_t size() const { return vtable->size(instance); }
+ KFR_MEM_INTRINSIC void begin_block(cinput_t, size_t size) const { vtable->begin_block(instance, size); }
+ KFR_MEM_INTRINSIC void end_block(cinput_t, size_t size) const { vtable->end_block(instance, size); }
+ KFR_MEM_INTRINSIC size_t size() const { return vtable->size(instance); }
- CMT_INLINE bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const
+ KFR_MEM_INTRINSIC bool substitute(expression_pointer<T>&& new_pointer, csize_t<0> = csize_t<0>{}) const
{
return vtable->substitute(instance, std::move(new_pointer));
}
@@ -179,7 +186,7 @@ namespace internal
{
template <typename T, typename E>
-CMT_INLINE expression_vtable<T>* make_expression_vtable()
+KFR_INTRINSIC expression_vtable<T>* make_expression_vtable()
{
static_assert(is_input_expression<E>::value, "E must be an expression");
static expression_vtable<T> vtable{ ctype_t<decay<E>>{} };
@@ -192,7 +199,7 @@ CMT_INLINE expression_vtable<T>* make_expression_vtable()
* @warning Use with caution with local variables.
*/
template <typename E, typename T = value_type_of<E>>
-CMT_INLINE expression_pointer<T> to_pointer(E& expr)
+KFR_INTRINSIC expression_pointer<T> to_pointer(E& expr)
{
static_assert(is_input_expression<E>::value, "E must be an expression");
return expression_pointer<T>(std::addressof(expr), internal::make_expression_vtable<T, E>());
@@ -203,7 +210,7 @@ CMT_INLINE expression_pointer<T> to_pointer(E& expr)
* @note Use std::move to force use of this overload.
*/
template <typename E, typename T = value_type_of<E>>
-CMT_INLINE expression_pointer<T> to_pointer(E&& expr)
+KFR_INTRINSIC expression_pointer<T> to_pointer(E&& expr)
{
static_assert(is_input_expression<E>::value, "E must be an expression");
std::shared_ptr<expression_resource> ptr = make_resource(std::move(expr));
@@ -215,24 +222,25 @@ template <typename T, size_t key>
class expression_placeholder : public input_expression
{
public:
- using value_type = T;
- expression_placeholder() noexcept = default;
+ using value_type = T;
+ expression_placeholder() CMT_NOEXCEPT = default;
template <typename U, size_t N>
- CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
+ friend KFR_INTRINSIC vec<U, N> get_elements(const expression_placeholder& self, cinput_t,
+ size_t index, vec_shape<U, N>)
{
- return pointer ? cast<U>(pointer(cinput, index, vec_t<T, N>())) : 0;
+ return self.pointer ? elemcast<U>(get_elements(self.pointer, cinput, index, vec_shape<T, N>())) : 0;
}
expression_pointer<T> pointer;
};
template <typename T, size_t key = 0>
-KFR_SINTRIN expression_placeholder<T, key> placeholder(csize_t<key> = csize_t<key>{})
+KFR_INTRINSIC expression_placeholder<T, key> placeholder(csize_t<key> = csize_t<key>{})
{
return expression_placeholder<T, key>();
}
template <typename... Args>
-KFR_SINTRIN bool substitute(input_expression&, Args&&...)
+KFR_INTRINSIC bool substitute(input_expression&, Args&&...)
{
return false;
}
@@ -240,28 +248,28 @@ KFR_SINTRIN bool substitute(input_expression&, Args&&...)
namespace internal
{
template <typename... Args, typename T, size_t key, size_t... indices>
-KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer,
- csize_t<key>, csizes_t<indices...>);
+KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr,
+ expression_pointer<T>&& new_pointer, csize_t<key>, csizes_t<indices...>);
}
template <typename T, size_t key = 0>
-KFR_SINTRIN bool substitute(expression_placeholder<T, key>& expr, expression_pointer<T>&& new_pointer,
- csize_t<key> = csize_t<key>{})
+KFR_INTRINSIC bool substitute(expression_placeholder<T, key>& expr, expression_pointer<T>&& new_pointer,
+ csize_t<key> = csize_t<key>{})
{
expr.pointer = std::move(new_pointer);
return true;
}
template <typename... Args, typename T, size_t key = 0>
-KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer,
- csize_t<key> = csize_t<key>{})
+KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr,
+ expression_pointer<T>&& new_pointer, csize_t<key> = csize_t<key>{})
{
return internal::substitute(expr, std::move(new_pointer), csize_t<key>{}, indicesfor_t<Args...>{});
}
template <typename T, size_t key = 0>
-KFR_SINTRIN bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& new_pointer,
- csize_t<key> = csize_t<key>{})
+KFR_INTRINSIC bool substitute(expression_pointer<T>& expr, expression_pointer<T>&& new_pointer,
+ csize_t<key> = csize_t<key>{})
{
return expr.substitute(std::move(new_pointer), csize_t<key>{});
}
@@ -269,17 +277,17 @@ KFR_SINTRIN bool substitute(expression_pointer<T>& expr, expression_pointer<T>&&
namespace internal
{
-KFR_SINTRIN bool var_or() { return false; }
+KFR_INTRINSIC bool var_or() { return false; }
template <typename... Args>
-KFR_SINTRIN bool var_or(bool b, Args... args)
+KFR_INTRINSIC bool var_or(bool b, Args... args)
{
return b || var_or(args...);
}
template <typename... Args, typename T, size_t key, size_t... indices>
-KFR_SINTRIN bool substitute(internal::expression_base<Args...>& expr, expression_pointer<T>&& new_pointer,
- csize_t<key>, csizes_t<indices...>)
+KFR_INTRINSIC bool substitute(internal::expression_with_arguments<Args...>& expr,
+ expression_pointer<T>&& new_pointer, csize_t<key>, csizes_t<indices...>)
{
return var_or(substitute(std::get<indices>(expr.args), std::move(new_pointer), csize_t<key>())...);
}
@@ -290,10 +298,11 @@ namespace internal
{
template <typename Expression, typename T, size_t key>
-KFR_SINTRIN bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, csize_t<key>)
+KFR_INTRINSIC bool invoke_substitute(Expression& expr, expression_pointer<T>&& new_pointer, csize_t<key>)
{
return kfr::substitute(expr, std::move(new_pointer), csize_t<key>{});
}
} // namespace internal
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/random.hpp b/include/kfr/base/random.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup math
+/** @addtogroup random
* @{
*/
/*
@@ -24,55 +24,58 @@
See https://www.kfrlib.com for details.
*/
#pragma once
-#include "function.hpp"
-#include "operators.hpp"
-#include "shuffle.hpp"
-#include "vec.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/shuffle.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
-using random_state = u32x4;
-
#ifndef KFR_DISABLE_READCYCLECOUNTER
-
struct seed_from_rdtsc_t
{
};
constexpr seed_from_rdtsc_t seed_from_rdtsc{};
+#endif
+
+inline namespace CMT_ARCH_NAME
+{
+
+using random_state = u32x4;
-#ifndef KFR_READCYCLECOUNTER
+#ifndef KFR_DISABLE_READCYCLECOUNTER
#ifdef CMT_COMPILER_CLANG
-#define KFR_READCYCLECOUNTER() __builtin_readcyclecounter()
+#define KFR_builtin_readcyclecounter() \
+ static_cast<u64>(__builtin_readcyclecounter()) // Intel C++ requires cast here
#else
-#define KFR_READCYCLECOUNTER() __rdtsc()
+#define KFR_builtin_readcyclecounter() static_cast<u64>(__rdtsc())
#endif
#endif
-#endif
-
struct random_bit_generator
{
-
#ifndef KFR_DISABLE_READCYCLECOUNTER
- random_bit_generator(seed_from_rdtsc_t) noexcept
- : state(bitcast<u32>(
- make_vector(KFR_READCYCLECOUNTER(), (KFR_READCYCLECOUNTER() << 11) ^ 0x710686d615e2257bull)))
+ KFR_MEM_INTRINSIC random_bit_generator(seed_from_rdtsc_t) CMT_NOEXCEPT
+ : state(bitcast<u32>(make_vector(KFR_builtin_readcyclecounter(),
+ (KFR_builtin_readcyclecounter() << 11) ^ 0x710686d615e2257bull)))
{
(void)operator()();
}
#endif
- random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) noexcept : state(x0, x1, x2, x3)
+ KFR_MEM_INTRINSIC random_bit_generator(u32 x0, u32 x1, u32 x2, u32 x3) CMT_NOEXCEPT
+ : state(x0, x1, x2, x3)
{
(void)operator()();
}
- random_bit_generator(u64 x0, u64 x1) noexcept : state(bitcast<u32>(make_vector(x0, x1)))
+ KFR_MEM_INTRINSIC random_bit_generator(u64 x0, u64 x1) CMT_NOEXCEPT
+ : state(bitcast<u32>(make_vector(x0, x1)))
{
(void)operator()();
}
- inline random_state operator()()
+ KFR_MEM_INTRINSIC random_state operator()()
{
const static random_state mul{ 214013u, 17405u, 214013u, 69069u };
const static random_state add{ 2531011u, 10395331u, 13737667u, 1u };
@@ -87,13 +90,13 @@ protected:
static_assert(sizeof(random_state) == 16, "sizeof(random_state) == 16");
template <size_t N, KFR_ENABLE_IF(N <= sizeof(random_state))>
-inline vec<u8, N> random_bits(random_bit_generator& gen)
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
{
return narrow<N>(bitcast<u8>(gen()));
}
template <size_t N, KFR_ENABLE_IF(N > sizeof(random_state))>
-inline vec<u8, N> random_bits(random_bit_generator& gen)
+KFR_INTRINSIC vec<u8, N> random_bits(random_bit_generator& gen)
{
constexpr size_t N2 = prev_poweroftwo(N - 1);
const vec<u8, N2> bits1 = random_bits<N2>(gen);
@@ -102,37 +105,37 @@ inline vec<u8, N> random_bits(random_bit_generator& gen)
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_integral<T>::value)>
-inline vec<T, N> random_uniform(random_bit_generator& gen)
+KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen)
{
return bitcast<T>(random_bits<N * sizeof(T)>(gen));
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_same<T, f32>::value)>
-inline vec<f32, N> randommantissa(random_bit_generator& gen)
+KFR_INTRINSIC vec<f32, N> randommantissa(random_bit_generator& gen)
{
return bitcast<f32>((random_uniform<u32, N>(gen) & 0x7FFFFFu) | 0x3f800000u) + 0.0f;
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_same<T, f64>::value)>
-inline vec<f64, N> randommantissa(random_bit_generator& gen)
+KFR_INTRINSIC vec<f64, N> randommantissa(random_bit_generator& gen)
{
return bitcast<f64>((random_uniform<u64, N>(gen) & 0x000FFFFFFFFFFFFFull) | 0x3FF0000000000000ull) + 0.0;
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-inline vec<T, N> random_uniform(random_bit_generator& gen)
+KFR_INTRINSIC vec<T, N> random_uniform(random_bit_generator& gen)
{
return randommantissa<T, N>(gen) - 1.f;
}
template <size_t N, typename T, KFR_ENABLE_IF(is_f_class<T>::value)>
-inline vec<T, N> random_range(random_bit_generator& gen, T min, T max)
+KFR_INTRINSIC vec<T, N> random_range(random_bit_generator& gen, T min, T max)
{
return mix(random_uniform<T, N>(gen), min, max);
}
template <size_t N, typename T, KFR_ENABLE_IF(!is_f_class<T>::value)>
-inline vec<T, N> random_range(random_bit_generator& gen, T min, T max)
+KFR_INTRINSIC vec<T, N> random_range(random_bit_generator& gen, T min, T max)
{
using big_type = findinttype<sqr(std::numeric_limits<T>::min()), sqr(std::numeric_limits<T>::max())>;
@@ -147,11 +150,11 @@ template <typename T>
struct expression_random_uniform : input_expression
{
using value_type = T;
- constexpr expression_random_uniform(const random_bit_generator& gen) noexcept : gen(gen) {}
+ constexpr expression_random_uniform(const random_bit_generator& gen) CMT_NOEXCEPT : gen(gen) {}
template <size_t N>
- vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const
+ friend vec<T, N> get_elements(const expression_random_uniform& self, cinput_t, size_t, vec_shape<T, N>)
{
- return random_uniform<T, N>(gen);
+ return random_uniform<T, N>(self.gen);
}
mutable random_bit_generator gen;
};
@@ -160,15 +163,16 @@ template <typename T>
struct expression_random_range : input_expression
{
using value_type = T;
- constexpr expression_random_range(const random_bit_generator& gen, T min, T max) noexcept
- : gen(gen), min(min), max(max)
+ constexpr expression_random_range(const random_bit_generator& gen, T min, T max) CMT_NOEXCEPT : gen(gen),
+ min(min),
+ max(max)
{
}
template <size_t N>
- vec<T, N> operator()(cinput_t, size_t, vec_t<T, N>) const
+ friend vec<T, N> get_elements(const expression_random_range& self, cinput_t, size_t, vec_shape<T, N>)
{
- return random_range<N, T>(gen, min, max);
+ return random_range<N, T>(self.gen, self.min, self.max);
}
mutable random_bit_generator gen;
const T min;
@@ -178,16 +182,15 @@ struct expression_random_range : input_expression
/// @brief Returns expression that returns pseudo random values
template <typename T>
-inline internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen)
+KFR_FUNCTION internal::expression_random_uniform<T> gen_random_uniform(const random_bit_generator& gen)
{
return internal::expression_random_uniform<T>(gen);
}
-
#ifndef KFR_DISABLE_READCYCLECOUNTER
/// @brief Returns expression that returns pseudo random values
template <typename T>
-inline internal::expression_random_uniform<T> gen_random_uniform()
+KFR_FUNCTION internal::expression_random_uniform<T> gen_random_uniform()
{
return internal::expression_random_uniform<T>(random_bit_generator(seed_from_rdtsc));
}
@@ -195,18 +198,19 @@ inline internal::expression_random_uniform<T> gen_random_uniform()
/// @brief Returns expression that returns pseudo random values of the given range
template <typename T>
-inline internal::expression_random_range<T> gen_random_range(const random_bit_generator& gen, T min, T max)
+KFR_FUNCTION internal::expression_random_range<T> gen_random_range(const random_bit_generator& gen, T min,
+ T max)
{
return internal::expression_random_range<T>(gen, min, max);
}
-
#ifndef KFR_DISABLE_READCYCLECOUNTER
/// @brief Returns expression that returns pseudo random values of the given range
template <typename T>
-inline internal::expression_random_range<T> gen_random_range(T min, T max)
+KFR_FUNCTION internal::expression_random_range<T> gen_random_range(T min, T max)
{
return internal::expression_random_range<T>(random_bit_generator(seed_from_rdtsc), min, max);
}
#endif
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp
@@ -1,239 +0,0 @@
-/** @addtogroup types
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "shuffle.hpp"
-#include "types.hpp"
-#include "vec.hpp"
-
-namespace kfr
-{
-
-template <size_t N, bool A = false, typename T>
-CMT_INLINE static vec<T, N> read(const T* src)
-{
- return vec<T, N>(src, cbool_t<A>());
-}
-
-template <bool A = false, size_t N, typename T>
-CMT_INLINE static void write(T* dest, const vec<T, N>& value)
-{
- value.write(dest, cbool_t<A>());
-}
-
-template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)>
-CMT_INLINE vec<T, Nout> gather(const T* base, size_t index, Indices... indices)
-{
- return make_vector(base[index], base[indices]...);
-}
-
-template <size_t Index, size_t... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)>
-CMT_INLINE vec<T, Nout> gather(const T* base)
-{
- return make_vector(base[Index], base[Indices]...);
-}
-
-template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0>
-CMT_INLINE void scatter(const T* base, const vec<T, N>& value)
-{
- base[Index] = value[InIndex];
- scatter<Indices..., T, N, InIndex + 1>(base, value);
-}
-
-namespace internal
-{
-template <typename T, size_t N, size_t... Indices>
-CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>)
-{
- return make_vector(base[indices[Indices]]...);
-}
-template <size_t Nout, size_t Stride, typename T, size_t... Indices>
-CMT_INLINE vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>)
-{
- return make_vector(base[Indices * Stride]...);
-}
-template <size_t Nout, size_t groupsize, typename T, size_t... Indices>
-CMT_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>)
-{
- return make_vector(read<groupsize>(base + Indices * groupsize * stride)...);
-}
-} // namespace internal
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices)
-{
- return internal::gather(base, indices, csizeseq_t<N>());
-}
-
-template <size_t Nout, size_t groupsize = 1, typename T>
-CMT_INLINE vec<T, Nout * groupsize> gather_stride(const T* base, size_t stride)
-{
- return internal::gather_stride_s<Nout, groupsize>(base, stride, csizeseq_t<Nout>());
-}
-
-template <size_t Nout, size_t Stride, typename T>
-CMT_INLINE vec<T, Nout> gather_stride(const T* base)
-{
- return internal::gather_stride<Nout, Stride>(base, csizeseq_t<Nout>());
-}
-
-template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices>
-CMT_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>)
-{
- return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...);
-}
-template <size_t groupsize = 1, typename T, size_t N, typename IT>
-CMT_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset)
-{
- return gather_helper<groupsize>(base, offset, csizeseq_t<N>());
-}
-
-template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices>
-CMT_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value,
- csizes_t<Indices...>)
-{
- swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)),
- 0)... };
-}
-template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, size_t... Indices>
-CMT_INLINE void scatter_helper_s(T* base, size_t stride, const vec<T, Nout>& value, csizes_t<Indices...>)
-{
- swallow{ (write(base + groupsize * stride, slice<Indices * groupsize, groupsize>(value)), 0)... };
-}
-template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT>
-CMT_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value)
-{
- return scatter_helper<groupsize>(base, offset, value, csizeseq_t<N>());
-}
-
-template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT>
-CMT_INLINE void scatter_stride(T* base, const vec<T, Nout>& value, size_t stride)
-{
- return scatter_helper_s<groupsize>(base, stride, value, csizeseq_t<N>());
-}
-
-template <typename T, size_t groupsize = 1>
-struct stride_pointer : public stride_pointer<const T, groupsize>
-{
- template <size_t N>
- void write(const vec<T, N>& val, csize_t<N> = csize_t<N>())
- {
- kfr::scatter_stride<N, groupsize>(this->ptr, val);
- }
-};
-
-template <typename T, size_t groupsize>
-struct stride_pointer<const T, groupsize>
-{
- const T* ptr;
- const size_t stride;
-
- template <size_t N>
- vec<T, N> read(csize_t<N> = csize_t<N>())
- {
- return kfr::gather_stride<N, groupsize>(ptr, stride);
- }
-};
-
-template <typename T>
-constexpr T partial_masks[] = { constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- constants<T>::allones(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T(),
- T() };
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> partial_mask(size_t index)
-{
- static_assert(N <= arraysize(partial_masks<T>) / 2,
- "N must not be greater than half of partial_masks expression_array");
- return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index);
-}
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> partial_mask(size_t index, vec_t<T, N>)
-{
- return partial_mask<T, N>(index);
-}
-} // namespace kfr
diff --git a/include/kfr/base/reduce.hpp b/include/kfr/base/reduce.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup expressions
+/** @addtogroup array
* @{
*/
/*
@@ -25,39 +25,41 @@
*/
#pragma once
+#include "../math/min_max.hpp"
+#include "../simd/horizontal.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/vec.hpp"
#include "basic_expressions.hpp"
-#include "function.hpp"
-#include "horizontal.hpp"
-#include "min_max.hpp"
-#include "operators.hpp"
-#include "vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
template <typename T>
-CMT_INLINE T final_mean(T value, size_t size)
+KFR_INTRINSIC T final_mean(T value, size_t size)
{
return value / T(size);
}
KFR_FN(final_mean)
template <typename T>
-CMT_INLINE T final_rootmean(T value, size_t size)
+KFR_INTRINSIC T final_rootmean(T value, size_t size)
{
- return internal::builtin_sqrt(value / T(size));
+ return builtin_sqrt(value / T(size));
}
KFR_FN(final_rootmean)
namespace internal
{
template <typename FinalFn, typename T, KFR_ENABLE_IF(is_callable<FinalFn, T, size_t>::value)>
-CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t size, T value)
+KFR_INTRINSIC auto reduce_call_final(FinalFn&& finalfn, size_t size, T value)
{
return finalfn(value, size);
}
template <typename FinalFn, typename T, KFR_ENABLE_IF(!is_callable<FinalFn, T, size_t>::value)>
-CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value)
+KFR_INTRINSIC auto reduce_call_final(FinalFn&& finalfn, size_t, T value)
{
return finalfn(value);
}
@@ -65,7 +67,7 @@ CMT_INLINE auto reduce_call_final(FinalFn&& finalfn, size_t, T value)
template <typename T, typename ReduceFn, typename TransformFn, typename FinalFn, KFR_ARCH_DEP>
struct expression_reduce : output_expression
{
- constexpr static size_t width = platform<T>::vector_width * bitness_const(1, 2);
+ constexpr static size_t width = vector_width<T> * bitness_const(1, 2);
using value_type = T;
@@ -76,26 +78,29 @@ struct expression_reduce : output_expression
}
template <size_t N>
- CMT_INLINE void operator()(coutput_t, size_t, const vec<T, N>& x) const
+ KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<T, N>& x) const
{
counter += N;
process(x);
}
- CMT_INLINE T get() { return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn)); }
+ KFR_MEM_INTRINSIC T get()
+ {
+ return internal::reduce_call_final(finalfn, counter, horizontal(value, reducefn));
+ }
protected:
void reset() { counter = 0; }
- CMT_INLINE void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); }
+ KFR_MEM_INTRINSIC void process(const vec<T, width>& x) const { value = reducefn(transformfn(x), value); }
template <size_t N, KFR_ENABLE_IF(N < width)>
- CMT_INLINE void process(const vec<T, N>& x) const
+ KFR_MEM_INTRINSIC void process(const vec<T, N>& x) const
{
value = combine(value, reducefn(transformfn(x), narrow<N>(value)));
}
template <size_t N, KFR_ENABLE_IF(N > width)>
- CMT_INLINE void process(const vec<T, N>& x) const
+ KFR_MEM_INTRINSIC void process(const vec<T, N>& x) const
{
process(low(x));
process(high(x));
@@ -109,10 +114,11 @@ protected:
};
} // namespace internal
-template <typename ReduceFn, typename TransformFn = fn::pass_through, typename FinalFn = fn::pass_through,
- typename E1, typename T = value_type_of<E1>>
-KFR_SINTRIN T reduce(const E1& e1, ReduceFn&& reducefn, TransformFn&& transformfn = fn::pass_through(),
- FinalFn&& finalfn = fn::pass_through())
+template <typename ReduceFn, typename TransformFn = fn_generic::pass_through,
+ typename FinalFn = fn_generic::pass_through, typename E1, typename T = value_type_of<E1>>
+KFR_INTRINSIC T reduce(const E1& e1, ReduceFn&& reducefn,
+ TransformFn&& transformfn = fn_generic::pass_through(),
+ FinalFn&& finalfn = fn_generic::pass_through())
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
using reducer_t = internal::expression_reduce<T, decay<ReduceFn>, decay<TransformFn>, decay<FinalFn>>;
@@ -134,7 +140,7 @@ KFR_FN(reduce)
* \f]
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T sum(const E1& x)
+KFR_INTRINSIC T sum(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::add());
@@ -149,10 +155,10 @@ KFR_SINTRIN T sum(const E1& x)
* \f]
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T mean(const E1& x)
+KFR_INTRINSIC T mean(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
- return reduce(x, fn::add(), fn::pass_through(), fn::final_mean());
+ return reduce(x, fn::add(), fn_generic::pass_through(), fn::final_mean());
}
/**
@@ -161,7 +167,7 @@ KFR_SINTRIN T mean(const E1& x)
* x must have its size and type specified.
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T minof(const E1& x)
+KFR_INTRINSIC T minof(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::min());
@@ -173,7 +179,7 @@ KFR_SINTRIN T minof(const E1& x)
* x must have its size and type specified.
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T maxof(const E1& x)
+KFR_INTRINSIC T maxof(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::max());
@@ -185,7 +191,7 @@ KFR_SINTRIN T maxof(const E1& x)
* x must have its size and type specified.
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T absminof(const E1& x)
+KFR_INTRINSIC T absminof(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::absmin());
@@ -197,7 +203,7 @@ KFR_SINTRIN T absminof(const E1& x)
* x must have its size and type specified.
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T absmaxof(const E1& x)
+KFR_INTRINSIC T absmaxof(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::absmax());
@@ -214,7 +220,7 @@ KFR_SINTRIN T absmaxof(const E1& x)
template <typename E1, typename E2,
typename T = value_type_of<decltype(std::declval<E1>() * std::declval<E2>())>,
KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_SINTRIN T dotproduct(E1&& x, E2&& y)
+KFR_INTRINSIC T dotproduct(E1&& x, E2&& y)
{
auto m = std::forward<E1>(x) * std::forward<E2>(y);
using E12 = decltype(m);
@@ -231,7 +237,7 @@ KFR_SINTRIN T dotproduct(E1&& x, E2&& y)
\f]
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T rms(const E1& x)
+KFR_INTRINSIC T rms(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::add(), fn::sqr(), fn::final_rootmean());
@@ -246,7 +252,7 @@ KFR_SINTRIN T rms(const E1& x)
\f]
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T sumsqr(const E1& x)
+KFR_INTRINSIC T sumsqr(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::add(), fn::sqr());
@@ -261,9 +267,11 @@ KFR_SINTRIN T sumsqr(const E1& x)
\f]
*/
template <typename E1, typename T = value_type_of<E1>, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_SINTRIN T product(const E1& x)
+KFR_INTRINSIC T product(const E1& x)
{
static_assert(!is_infinite<E1>::value, "e1 must be a sized expression (use slice())");
return reduce(x, fn::mul());
}
+
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -1,158 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/round.hpp"
-
-namespace kfr
-{
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 floor(const T1& x)
-{
- return intrinsics::floor(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::floor, E1> floor(E1&& x)
-{
- return { fn::floor(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 ceil(const T1& x)
-{
- return intrinsics::ceil(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::ceil, E1> ceil(E1&& x)
-{
- return { fn::ceil(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 round(const T1& x)
-{
- return intrinsics::round(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::round, E1> round(E1&& x)
-{
- return { fn::round(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 trunc(const T1& x)
-{
- return intrinsics::trunc(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::trunc, E1> trunc(E1&& x)
-{
- return { fn::trunc(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 fract(const T1& x)
-{
- return intrinsics::fract(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::fract, E1> fract(E1&& x)
-{
- return { fn::fract(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN itype<T1> ifloor(const T1& x)
-{
- return intrinsics::ifloor(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::ifloor, E1> ifloor(E1&& x)
-{
- return { fn::ifloor(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN itype<T1> iceil(const T1& x)
-{
- return intrinsics::iceil(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::iceil, E1> iceil(E1&& x)
-{
- return { fn::iceil(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN itype<T1> iround(const T1& x)
-{
- return intrinsics::iround(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::iround, E1> iround(E1&& x)
-{
- return { fn::iround(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN itype<T1> itrunc(const T1& x)
-{
- return intrinsics::itrunc(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::itrunc, E1> itrunc(E1&& x)
-{
- return { fn::itrunc(), std::forward<E1>(x) };
-}
-
-template <typename T, KFR_ENABLE_IF(is_f_class<T>::value)>
-CMT_INLINE T fmod(const T& x, const T& y)
-{
- return x - trunc(x / y) * y;
-}
-KFR_FN(fmod)
-
-template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-constexpr CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x % y;
-}
-template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-CMT_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
-{
- return fmod(x, y);
-}
-} // namespace kfr
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -1,62 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/saturation.hpp"
-
-namespace kfr
-{
-
-/// @brief Adds two arguments using saturation
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout satadd(const T1& x, const T2& y)
-{
- return intrinsics::satadd(x, y);
-}
-
-/// @brief Creates an expression that adds two arguments using saturation
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::satadd, E1, E2> satadd(E1&& x, E2&& y)
-{
- return { fn::satadd(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-/// @brief Subtracts two arguments using saturation
-template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
- typename Tout = common_type<T1, T2>>
-KFR_INTRIN Tout satsub(const T1& x, const T2& y)
-{
- return intrinsics::satsub(x, y);
-}
-
-/// @brief Creates an expression that subtracts two arguments using saturation
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-KFR_INTRIN internal::expression_function<fn::satsub, E1, E2> satsub(E1&& x, E2&& y)
-{
- return { fn::satsub(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -1,57 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/select.hpp"
-
-namespace kfr
-{
-
-/**
- * @brief Returns x if m is true, otherwise return y. Order of the arguments is same as in ternary operator.
- * @code
- * return m ? x : y
- * @endcode
- */
-template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value),
- typename Tout = subtype<common_type<T2, T3>>>
-KFR_INTRIN vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y)
-{
- static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types");
- return intrinsics::select(bitcast<Tout>(m.asvec()).asmask(), static_cast<vec<Tout, N>>(x),
- static_cast<vec<Tout, N>>(y));
-}
-
-/**
- * @brief Returns template expression that returns x if m is true, otherwise return y. Order of the arguments
- * is same as in ternary operator.
- */
-template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
-KFR_INTRIN internal::expression_function<fn::select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y)
-{
- return { fn::select(), std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/shuffle.hpp b/include/kfr/base/shuffle.hpp
@@ -1,625 +0,0 @@
-/** @addtogroup shuffle
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "constants.hpp"
-#include "expression.hpp"
-#include "types.hpp"
-#include "vec.hpp"
-
-#include <utility>
-
-namespace kfr
-{
-
-namespace internal
-{
-
-template <typename T, typename... Ts, size_t... indices, size_t Nin = sizeof...(Ts),
- size_t Nout = sizeof...(indices)>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<T, Nout> broadcast_helper(csizes_t<indices...>, const Ts&... values)
-{
- const std::tuple<Ts...> tup(values...);
- return vec<T, Nout>(std::get<indices % Nin>(tup)...);
-}
-} // namespace internal
-
-template <size_t Nout, typename... Ts, typename C = typename std::common_type<Ts...>::type>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<C, Nout> broadcast(const Ts&... values)
-{
- return internal::broadcast_helper<C>(csizeseq_t<Nout>(), values...);
-}
-KFR_FN(broadcast)
-
-template <size_t Ncount, typename T, size_t N>
-CMT_INLINE vec<T, N + Ncount> padhigh(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<N + Ncount>());
-}
-KFR_FN(padhigh)
-
-template <size_t Ncount, typename T, size_t N>
-CMT_INLINE vec<T, N + Ncount> padlow(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<N + Ncount, 0 - Ncount>());
-}
-KFR_FN(padlow)
-
-template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)>
-CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<Nout>());
-}
-template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)>
-constexpr CMT_INLINE vec<T, Nout> extend(const vec<T, N>& x)
-{
- return x;
-}
-KFR_FN(extend)
-
-template <size_t start, size_t count, typename T, size_t N>
-CMT_INLINE vec<T, count> slice(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<count, start>());
-}
-template <size_t start, size_t count, typename T, size_t N>
-CMT_INLINE vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x.shuffle(y, csizeseq_t<count, start>());
-}
-KFR_FN(slice)
-
-template <size_t start, size_t count, typename T, size_t N>
-CMT_INLINE vec<T, N> replace(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x.shuffle(
- y, csizeseq_t<N>() +
- (csizeseq_t<N>() >= csize_t<start>() && csizeseq_t<N>() < csize_t<start + count>()) * N);
-}
-KFR_FN(replace)
-
-template <size_t, typename T, size_t N>
-CMT_INLINE void split(const vec<T, N>&)
-{
-}
-template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args>
-CMT_INLINE void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... args)
-{
- out = x.shuffle(csizeseq_t<Nout, start>());
- split<start + Nout>(x, std::forward<Args>(args)...);
-}
-template <typename T, size_t N>
-CMT_INLINE void split(const vec<T, N>& x, vec<T, N / 2>& low, vec<T, N / 2>& high)
-{
- low = x.shuffle(csizeseq_t<N / 2, 0>());
- high = x.shuffle(csizeseq_t<N / 2, N / 2>());
-}
-template <typename T, size_t N>
-CMT_INLINE void split(const vec<T, N>& x, vec<T, N / 4>& w0, vec<T, N / 4>& w1, vec<T, N / 4>& w2,
- vec<T, N / 4>& w3)
-{
- w0 = x.shuffle(csizeseq_t<N / 4, 0>());
- w1 = x.shuffle(csizeseq_t<N / 4, N / 4>());
- w2 = x.shuffle(csizeseq_t<N / 4, 2 * N / 4>());
- w3 = x.shuffle(csizeseq_t<N / 4, 3 * N / 4>());
-}
-KFR_FN(split)
-
-template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total>
-CMT_INLINE vec<T, Nout> part(const vec<T, N>& x)
-{
- static_assert(N % total == 0, "N % total == 0");
- return x.shuffle(csizeseq_t<Nout, number * Nout>());
-}
-KFR_FN(part)
-
-template <size_t start, size_t count, typename T, size_t N>
-CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x.shuffle(y, csizeseq_t<count, start>());
-}
-
-template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)>
-CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<count, start>());
-}
-
-template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)>
-CMT_INLINE vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- return x.shuffle(csizeseq_t<N2, -(N2 - N1)>())
- .shuffle(y, csizeseq_t<N2 * 2>())
- .shuffle(csizeseq_t<count, N2 - N1 + start>());
-}
-
-KFR_FN(concat_and_slice)
-
-template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout > N)>
-CMT_INLINE vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T())
-{
- static_assert(Nout > N, "Nout > N");
- return concat(x, broadcast<Nout - N>(newvalue));
-}
-template <size_t Nout, typename T, typename TS>
-constexpr CMT_INLINE vec<T, Nout> widen(const vec<T, Nout>& x, TS)
-{
- return x;
-}
-KFR_FN(widen)
-
-template <size_t Nout, typename T, size_t N>
-CMT_INLINE vec<T, Nout> narrow(const vec<T, N>& x)
-{
- static_assert(Nout <= N, "Nout <= N");
- return slice<0, Nout>(x);
-}
-KFR_FN(narrow)
-
-template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)>
-CMT_INLINE vec<T, Nout> even(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<Nout / group, 0, 2>()));
-}
-KFR_FN(even)
-
-template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)>
-CMT_INLINE vec<T, Nout> odd(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<Nout / group, 1, 2>()));
-}
-KFR_FN(odd)
-
-namespace internal
-{
-template <size_t groupsize = 2>
-struct shuffle_index_dup1
-{
- constexpr inline size_t operator()(size_t index) const { return index / groupsize; }
-};
-
-template <size_t groupsize = 2, size_t start = 0>
-struct shuffle_index_dup
-{
- constexpr inline size_t operator()(size_t index) const { return start + index / groupsize * groupsize; }
-};
-} // namespace internal
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> dupeven(const vec<T, N>& x)
-{
- static_assert(N % 2 == 0, "N must be even");
- return x.shuffle(csizeseq_t<N, 0, 1>() & ~csize_t<1>());
-}
-KFR_FN(dupeven)
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> dupodd(const vec<T, N>& x)
-{
- static_assert(N % 2 == 0, "N must be even");
- return x.shuffle(csizeseq_t<N, 0, 1>() | csize_t<1>());
-}
-KFR_FN(dupodd)
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N * 2> duphalfs(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<N * 2>() % csize_t<N>());
-}
-KFR_FN(duphalfs)
-
-namespace internal
-{
-template <size_t size, size_t... Indices>
-struct shuffle_index_shuffle
-{
- constexpr static size_t indexcount = sizeof...(Indices);
-
- template <size_t index>
- constexpr inline size_t operator()(csize_t<index>) const
- {
- return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount;
- }
-};
-} // namespace internal
-
-template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
-CMT_INLINE vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y,
- elements_t<Indices...> i = elements_t<Indices...>())
-{
- return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] +
- csizeseq_t<N>() / csize_t<count>() * csize_t<count>());
-}
-KFR_FN(shuffle)
-
-template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
-CMT_INLINE vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y,
- elements_t<Indices...> i = elements_t<Indices...>())
-{
- return x.shuffle(y, scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] +
- csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>()));
-}
-KFR_FN(shufflegroups)
-
-namespace internal
-{
-template <size_t size, size_t... Indices>
-struct shuffle_index_permute
-{
- constexpr static size_t indexcount = sizeof...(Indices);
-
- template <size_t index>
- constexpr inline size_t operator()(csize_t<index>) const
- {
- return csizes_t<Indices...>::get(csize_t<index % indexcount>()) + index / indexcount * indexcount;
- }
-};
-} // namespace internal
-
-template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
-CMT_INLINE vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>())
-{
- return x.shuffle(i[csizeseq_t<N>() % csize_t<count>()] +
- csizeseq_t<N>() / csize_t<count>() * csize_t<count>());
-}
-KFR_FN(permute)
-
-template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
-CMT_INLINE vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>())
-{
- return x.shuffle(scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] +
- csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>()));
-}
-KFR_FN(permutegroups)
-
-namespace internal
-{
-
-template <typename T, size_t Nout, typename Fn, size_t... Indices>
-constexpr CMT_INLINE vec<T, Nout> generate_vector(csizes_t<Indices...>)
-{
- return make_vector(static_cast<T>(Fn()(Indices))...);
-}
-} // namespace internal
-
-template <typename T, size_t Nout, typename Fn>
-constexpr CMT_INLINE vec<T, Nout> generate_vector()
-{
- return internal::generate_vector<T, Nout, Fn>(cvalseq_t<size_t, Nout>());
-}
-KFR_FN(generate_vector)
-
-namespace internal
-{
-template <typename T, size_t N>
-constexpr CMT_INLINE mask<T, N> evenmask()
-{
- return broadcast<N, T>(maskbits<T>(true), maskbits<T>(false));
-}
-template <typename T, size_t N>
-constexpr CMT_INLINE mask<T, N> oddmask()
-{
- return broadcast<N, T>(maskbits<T>(false), maskbits<T>(true));
-}
-} // namespace internal
-
-template <typename T, size_t N, size_t Nout = N * 2>
-CMT_INLINE vec<T, Nout> dup(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<Nout>() / csize_t<2>());
-}
-KFR_FN(dup)
-
-namespace internal
-{
-template <size_t count, size_t start = 0>
-struct shuffle_index_duphalf
-{
- constexpr inline size_t operator()(size_t index) const { return start + (index) % count; }
-};
-} // namespace internal
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> duplow(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>());
-}
-KFR_FN(duplow)
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> duphigh(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>() + csize_t<N - N / 2>());
-}
-KFR_FN(duphigh)
-
-namespace internal
-{
-template <size_t size, size_t... Indices>
-struct shuffle_index_blend
-{
- constexpr static size_t indexcount = sizeof...(Indices);
-
- template <size_t index>
- constexpr inline size_t operator()(csize_t<index>) const
- {
- return (elements_t<Indices...>::get(csize_t<index % indexcount>()) ? size : 0) + index % size;
- }
-};
-} // namespace internal
-
-template <size_t... Indices, typename T, size_t N>
-CMT_INLINE vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y,
- elements_t<Indices...> i = elements_t<Indices...>())
-{
- return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] * csize_t<N>() + csizeseq_t<N>());
-}
-KFR_FN(blend)
-
-namespace internal
-{
-template <size_t elements>
-struct shuffle_index_swap
-{
- constexpr inline size_t operator()(size_t index) const
- {
- static_assert(is_poweroftwo(elements), "is_poweroftwo( elements )");
- return index ^ (elements - 1);
- }
-};
-template <size_t amount, size_t N>
-struct shuffle_index_outputright
-{
- constexpr inline size_t operator()(size_t index) const
- {
- return index < N - amount ? index : index + amount;
- }
-};
-} // namespace internal
-
-template <size_t elements = 2, typename T, size_t N>
-CMT_INLINE vec<T, N> swap(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<N>() ^ csize_t<elements - 1>());
-}
-CMT_FN_TPL((size_t elements), (elements), swap)
-
-template <size_t shift, typename T, size_t N>
-CMT_INLINE vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi)
-{
- return shift == 0 ? lo : (shift == N ? hi : hi.shuffle(lo, csizeseq_t<N, N - shift>()));
-}
-
-template <size_t amount, typename T, size_t N>
-CMT_INLINE vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>())
-{
- static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N");
- return x.shuffle(csizeseq_t<N, N - amount>() % csize_t<N>());
-}
-KFR_FN(rotateright)
-
-template <size_t amount, typename T, size_t N>
-CMT_INLINE vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>())
-{
- static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N");
- return x.shuffle(csizeseq_t<N, amount>() % csize_t<N>());
-}
-KFR_FN(rotateleft)
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> insertright(T x, const vec<T, N>& y)
-{
- return concat_and_slice<1, N>(y, vec<T, 1>(x));
-}
-KFR_FN(insertright)
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> insertleft(T x, const vec<T, N>& y)
-{
- return concat_and_slice<0, N>(vec<T, 1>(x), y);
-}
-KFR_FN(insertleft)
-
-// template <typename T, size_t N, size_t N2>
-// CMT_INLINE vec<T, N> outputright(const vec<T, N>& x, const vec<T, N2>& y)
-//{
-// return shufflevector<N, internal::shuffle_index_outputright<N2, N>>(x, extend<N>(y));
-//}
-// KFR_FN(outputright)
-
-namespace internal
-{
-template <size_t size, size_t side1>
-struct shuffle_index_transpose
-{
- constexpr inline size_t operator()(size_t index) const
- {
- return index % (size / side1) * side1 + index / (size / side1);
- }
-};
-} // namespace internal
-
-template <size_t side1, size_t group = 1, typename T, size_t N, size_t size = N / group,
- size_t side2 = size / side1, KFR_ENABLE_IF(size > 3)>
-CMT_INLINE vec<T, N> transpose(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
- csizeseq_t<size>() / csize_t<side2>()));
-}
-template <size_t side, size_t group = 1, typename T, size_t N, KFR_ENABLE_IF(N / group <= 3)>
-CMT_INLINE vec<T, N> transpose(const vec<T, N>& x)
-{
- return x;
-}
-template <typename T, size_t N>
-CMT_INLINE vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x)
-{
- return vec<vec<T, N>, N>(transpose<2>(x.flatten()));
-}
-KFR_FN(transpose)
-
-template <size_t side2, size_t group = 1, typename T, size_t N, size_t size = N / group,
- size_t side1 = size / side2, KFR_ENABLE_IF(size > 3)>
-CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
- csizeseq_t<size>() / csize_t<side2>()));
-}
-template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)>
-CMT_INLINE vec<T, N> transposeinverse(const vec<T, N>& x)
-{
- return x;
-}
-KFR_FN(transposeinverse)
-
-template <size_t side, typename T, size_t N>
-CMT_INLINE vec<T, N> ctranspose(const vec<T, N>& x)
-{
- return transpose<side, 2>(x);
-}
-KFR_FN(ctranspose)
-
-template <size_t side, typename T, size_t N>
-CMT_INLINE vec<T, N> ctransposeinverse(const vec<T, N>& x)
-{
- return transposeinverse<side, 2>(x);
-}
-KFR_FN(ctransposeinverse)
-
-template <size_t group = 1, typename T, size_t N, size_t Nout = N * 2, size_t size = Nout / group,
- size_t side2 = 2, size_t side1 = size / side2>
-CMT_INLINE vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x.shuffle(y, scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
- csizeseq_t<size>() / csize_t<side2>()));
-}
-KFR_FN(interleave)
-
-template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
-CMT_INLINE internal::expression_function<fn::interleave, E1, E2> interleave(E1&& x, E2&& y)
-{
- return { fn::interleave(), std::forward<E1>(x), std::forward<E2>(y) };
-}
-
-template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side2 = 2,
- size_t side1 = size / side2>
-CMT_INLINE vec<T, N> interleavehalfs(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
- csizeseq_t<size>() / csize_t<side2>()));
-}
-KFR_FN(interleavehalfs)
-
-template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side1 = 2,
- size_t side2 = size / side1>
-CMT_INLINE vec<T, N> splitpairs(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
- csizeseq_t<size>() / csize_t<side2>()));
-}
-KFR_FN(splitpairs)
-
-namespace internal
-{
-template <size_t size>
-struct shuffle_index_reverse
-{
- constexpr inline size_t operator()(size_t index) const { return size - 1 - index; }
-};
-} // namespace internal
-
-template <size_t group = 1, typename T, size_t N, size_t size = N / group>
-CMT_INLINE vec<T, N> reverse(const vec<T, N>& x)
-{
- return x.shuffle(scale<group>(csizeseq_t<size, size - 1, -1>()));
-}
-template <typename T, size_t N1, size_t N2>
-CMT_INLINE vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x)
-{
- return vec<vec<T, N1>, N2>(swap<N1>(x.flatten()));
-}
-KFR_FN(reverse)
-
-namespace internal
-{
-template <size_t N1, size_t N2>
-struct shuffle_index_combine
-{
- constexpr inline size_t operator()(size_t index) const { return index >= N2 ? index : N1 + index; }
-};
-} // namespace internal
-
-template <typename T, size_t N1, size_t N2>
-CMT_INLINE vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- static_assert(N2 <= N1, "N2 <= N1");
- return x.shuffle(extend<N1>(y), (csizeseq_t<N1>() < csize_t<N2>()) * csize_t<N1>() + csizeseq_t<N1>());
- // return shufflevector<N1, internal::shuffle_index_combine<N1, N2>>(x, extend<N1>(y));
-}
-KFR_FN(combine)
-
-namespace internal
-{
-template <size_t start, size_t stride>
-struct generate_index
-{
- CMT_INLINE constexpr size_t operator()(size_t index) const { return start + index * stride; }
-};
-template <size_t start, size_t size, int on, int off>
-struct generate_onoff
-{
- CMT_INLINE constexpr size_t operator()(size_t index) const
- {
- return index >= start && index < start + size ? on : off;
- }
-};
-} // namespace internal
-
-template <typename T, size_t N, size_t start = 0, size_t stride = 1>
-constexpr CMT_INLINE vec<T, N> enumerate()
-{
- return generate_vector<T, N, internal::generate_index<start, stride>>();
-}
-template <size_t start = 0, size_t stride = 1, typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> enumerate(vec_t<T, N>)
-{
- return generate_vector<T, N, internal::generate_index<start, stride>>();
-}
-KFR_FN(enumerate)
-
-template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0>
-constexpr CMT_INLINE vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>())
-{
- return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>();
-}
-template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> onoff(vec_t<T, N>, cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>())
-{
- return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>();
-}
-KFR_FN(onoff)
-} // namespace kfr
-#define KFR_SHUFFLE_SPECIALIZATIONS 1
-#include "specializations.i"
diff --git a/include/kfr/base/simd_clang.hpp b/include/kfr/base/simd_clang.hpp
@@ -1,350 +0,0 @@
-/** @addtogroup types
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "kfr.h"
-#include "platform.hpp"
-#include "types.hpp"
-
-#if CMT_COMPILER_CLANG
-
-CMT_PRAGMA_MSVC(warning(push))
-CMT_PRAGMA_MSVC(warning(disable : 4324))
-
-namespace kfr
-{
-
-template <typename T, size_t... Ns>
-constexpr vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept;
-
-#define KFR_NATIVE_INTRINSICS 1
-
-namespace internal
-{
-template <typename TT, size_t NN>
-using simd_type = TT __attribute__((ext_vector_type(NN)));
-
-template <typename T, size_t N, bool A>
-using simd_storage = internal::struct_with_alignment<simd_type<T, N>, A>;
-
-template <typename T, size_t N, size_t... indices>
-CMT_INLINE simd_type<T, sizeof...(indices)> simd_shuffle(const simd_type<T, N>& x, const simd_type<T, N>& y,
- csizes_t<indices...>)
-{
- return __builtin_shufflevector(x, y, ((indices >= N * 2) ? -1 : static_cast<int>(indices))...);
-}
-template <typename T, size_t N, size_t... indices>
-CMT_INLINE simd_type<T, sizeof...(indices)> simd_shuffle(const simd_type<T, N>& x, csizes_t<indices...>)
-{
- return __builtin_shufflevector(x, x, ((indices >= N) ? -1 : static_cast<int>(indices))...);
-}
-
-template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
-CMT_INLINE simd_type<T, N> simd_read(const T* src)
-{
- return ptr_cast<simd_storage<T, N, A>>(src)->value;
-}
-
-template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
-CMT_INLINE simd_type<T, N> simd_read(const T* src)
-{
- constexpr size_t first = prev_poweroftwo(N);
- constexpr size_t rest = N - first;
- constexpr auto extend_indices =
- cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>());
- constexpr auto concat_indices = cvalseq_t<size_t, N>();
- return simd_shuffle<T, first>(simd_read<first, A>(src),
- simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices),
- concat_indices);
-}
-
-template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
-CMT_INLINE void simd_write(T* dest, const simd_type<T, N>& value)
-{
- ptr_cast<simd_storage<T, N, A>>(dest)->value = value;
-}
-
-template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
-CMT_INLINE void simd_write(T* dest, const simd_type<T, N>& value)
-{
- constexpr size_t first = prev_poweroftwo(N);
- constexpr size_t rest = N - first;
- simd_write<A, first>(dest, simd_shuffle(value, csizeseq_t<first>()));
- simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq_t<rest, first>()));
-}
-} // namespace internal
-
-template <typename T, size_t N>
-struct alignas(alignof(internal::simd_type<T, N>)) vec : public vec_t<T, N>
-{
- static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type");
-
- // type and size
- using value_type = T;
- constexpr static size_t size() noexcept { return N; }
-
- using scalar_type = T;
- constexpr static size_t scalar_size() noexcept { return N; }
-
- using mask_t = mask<T, N>;
-
- using simd_type = internal::simd_type<T, N>;
- using uvalue_type = utype<T>;
- using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>;
- using usimd_type = internal::simd_type<uvalue_type, N>;
- using iusimd_type = internal::simd_type<iuvalue_type, N>;
-
- // constructors and assignment
- // default
- constexpr vec() noexcept = default;
- // copy
- constexpr vec(const vec&) noexcept = default;
- // assignment
- constexpr vec& operator=(const vec&) noexcept = default;
- // from scalar
- template <typename U, typename = enable_if<(std::is_convertible<U, value_type>::value)>>
- constexpr vec(const U& s) noexcept : simd(s)
- {
- }
- // from list
- template <typename... Us>
- constexpr vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept
- : simd{ s0, s1, static_cast<value_type>(rest)... }
- {
- }
- // from vector of another type
- template <typename U, typename = enable_if<is_simd_type<U>::value>>
- constexpr vec(const vec<U, N>& v) noexcept : simd(__builtin_convertvector(v.simd, simd_type))
- {
- }
- constexpr vec(const simd_type& simd) noexcept : simd(simd) {}
- // from list of vectors
- template <size_t... Ns, typename = enable_if<csum<size_t, Ns...>() == N>>
- constexpr vec(const vec<T, Ns>&... vs) noexcept : simd(*concat(vs...))
- {
- }
- constexpr vec(czeros_t) noexcept : simd(0) {}
- constexpr vec(cones_t) noexcept : simd(*(vec() == vec())) {}
-
- template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)>
- constexpr static vec frombits(const vec<U, M>& v) noexcept
- {
- return (simd_type)(v.flatten().simd);
- }
-
-#define KFR_U(x) ((usimd_type)(x))
-#define KFR_IU(x) ((iusimd_type)(x))
-#define KFR_S(x) ((simd_type)(x))
-
- // math / bitwise / comparison operators
- constexpr friend vec operator+(const vec& x) noexcept { return x; }
- constexpr friend vec operator-(const vec& x) noexcept { return KFR_S(-*x); }
- constexpr friend vec operator~(const vec& x) noexcept { return KFR_S(~KFR_U(*x)); }
-
- constexpr friend vec operator+(const vec& x, const vec& y) noexcept { return *x + *y; }
- constexpr friend vec operator-(const vec& x, const vec& y) noexcept { return *x - *y; }
- constexpr friend vec operator*(const vec& x, const vec& y) noexcept { return *x * *y; }
- constexpr friend vec operator/(const vec& x, const vec& y) noexcept { return *x / *y; }
-
- constexpr friend vec operator<<(const vec& x, int shift) noexcept { return KFR_S(KFR_IU(*x) << shift); }
- constexpr friend vec operator>>(const vec& x, int shift) noexcept { return KFR_S(KFR_IU(*x) >> shift); }
- constexpr friend vec operator&(const vec& x, const vec& y) noexcept
- {
- return KFR_S(KFR_U(*x) & KFR_U(*y));
- }
- constexpr friend vec operator|(const vec& x, const vec& y) noexcept
- {
- return KFR_S(KFR_U(*x) | KFR_U(*y));
- }
- constexpr friend vec operator^(const vec& x, const vec& y) noexcept
- {
- return KFR_S(KFR_U(*x) ^ KFR_U(*y));
- }
-
- constexpr friend mask_t operator==(const vec& x, const vec& y) noexcept { return KFR_S(*x == *y); }
- constexpr friend mask_t operator!=(const vec& x, const vec& y) noexcept { return KFR_S(*x != *y); }
- constexpr friend mask_t operator<(const vec& x, const vec& y) noexcept { return KFR_S(*x < *y); }
- constexpr friend mask_t operator>(const vec& x, const vec& y) noexcept { return KFR_S(*x > *y); }
- constexpr friend mask_t operator<=(const vec& x, const vec& y) noexcept { return KFR_S(*x <= *y); }
- constexpr friend mask_t operator>=(const vec& x, const vec& y) noexcept { return KFR_S(*x >= *y); }
-
- constexpr mask_t asmask() const noexcept { return mask_t(simd); }
-
-#undef KFR_S
-#undef KFR_U
-
- constexpr friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; }
- constexpr friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; }
- constexpr friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; }
- constexpr friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; }
-
- constexpr friend vec& operator<<=(vec& x, int shift) noexcept { return x = x << shift; }
- constexpr friend vec& operator>>=(vec& x, int shift) noexcept { return x = x >> shift; }
- constexpr friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; }
- constexpr friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; }
- constexpr friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; }
-
- constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); }
- constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); }
- constexpr friend vec operator++(vec& x, int) noexcept
- {
- const vec z = x;
- ++x;
- return z;
- }
- constexpr friend vec operator--(vec& x, int) noexcept
- {
- const vec z = x;
- --x;
- return z;
- }
-
- // shuffle
- template <size_t... indices>
- vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept
- {
- return __builtin_shufflevector(simd, simd, (indices >= N ? -1 : int(indices))...);
- }
- template <size_t... indices>
- vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept
- {
- return __builtin_shufflevector(simd, y.simd, (indices >= N * 2 ? -1 : int(indices))...);
- }
-
- // element access
- struct element;
- constexpr value_type operator[](size_t index) const& noexcept { return get(index); }
- constexpr value_type operator[](size_t index) && noexcept { return get(index); }
- constexpr element operator[](size_t index) & noexcept { return { *this, index }; }
-
- constexpr value_type get(size_t index) const noexcept { return simd[index]; }
- constexpr void set(size_t index, const value_type& s) noexcept { simd[index] = s; }
- template <size_t index>
- constexpr value_type get(csize_t<index>) const noexcept
- {
- return simd[index];
- }
- template <size_t index>
- constexpr void set(csize_t<index>, const value_type& s) noexcept
- {
- simd[index] = s;
- }
- struct element
- {
- constexpr operator value_type() const noexcept { return v.get(index); }
- element& operator=(const value_type& s) noexcept
- {
- v.set(index, s);
- return *this;
- }
- element& operator=(const element& s) noexcept
- {
- v.set(index, static_cast<value_type>(s));
- return *this;
- }
- template <typename U, size_t M>
- element& operator=(const typename vec<U, M>::element& s) noexcept
- {
- v.set(index, static_cast<value_type>(static_cast<U>(s)));
- return *this;
- }
- vec& v;
- size_t index;
- };
-
- // read/write
- template <bool aligned = false>
- explicit constexpr vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept
- : simd(internal::simd_read<N, aligned>(src))
- {
- }
- template <bool aligned = false>
- const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept
- {
- internal::simd_write<aligned, N>(dest, simd);
- return *this;
- }
-
- // native SIMD type access
- const vec& flatten() const noexcept { return *this; }
- simd_type operator*() const noexcept { return simd; }
- simd_type& operator*() noexcept { return simd; }
-
-protected:
- template <typename U, size_t M>
- friend struct vec;
-
- simd_type simd;
-
-private:
-};
-
-namespace internal
-{
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x)
-{
- return x;
-}
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N * 2> concat_impl(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x.shuffle(y, csizeseq_t<N * 2>());
-}
-
-template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)>
-CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<N1 + N2>());
-}
-
-template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)>
-CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- return x.shuffle(csizeseq_t<N2, -(N2 - N1)>())
- .shuffle(y, csizeseq_t<N2 * 2>())
- .shuffle(csizeseq_t<N1 + N2, N2 - N1>());
-}
-
-template <typename T, size_t N1, size_t N2, size_t... Sizes>
-CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y,
- const vec<T, Sizes>&... args)
-{
- return concat_impl(concat_impl(x, y), args...);
-}
-} // namespace internal
-
-template <typename T, size_t... Ns>
-constexpr inline vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept
-{
- return internal::concat_impl(vs...);
-}
-} // namespace kfr
-
-CMT_PRAGMA_MSVC(warning(pop))
-
-#endif
diff --git a/include/kfr/base/simd_intrin.hpp b/include/kfr/base/simd_intrin.hpp
@@ -1,392 +0,0 @@
-/** @addtogroup types
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "kfr.h"
-
-#include "constants.hpp"
-#include "platform.hpp"
-#include "types.hpp"
-
-CMT_PRAGMA_MSVC(warning(push))
-CMT_PRAGMA_MSVC(warning(disable : 4324))
-CMT_PRAGMA_MSVC(warning(disable : 4814))
-
-#ifdef CMT_INTRINSICS_IS_CONSTEXPR
-#define KFR_I_CE constexpr
-#else
-#define KFR_I_CE
-#endif
-
-namespace kfr
-{
-
-template <typename T, size_t... Ns>
-constexpr vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept;
-
-#define KFR_NATIVE_INTRINSICS 1
-
-template <typename T, size_t N>
-struct simd_type_holder
-{
- using type = struct
- {
- T v[N];
- };
-};
-
-#define KFR_SIMD_SPEC_TYPE(T, N, MM) \
- template <> \
- struct simd_type_holder<T, N> \
- { \
- using type = MM; \
- };
-
-#ifdef CMT_ARCH_SSE2
-KFR_SIMD_SPEC_TYPE(u8, 16, __m128i);
-KFR_SIMD_SPEC_TYPE(u16, 8, __m128i);
-KFR_SIMD_SPEC_TYPE(u32, 4, __m128i);
-KFR_SIMD_SPEC_TYPE(u64, 2, __m128i);
-KFR_SIMD_SPEC_TYPE(i8, 16, __m128i);
-KFR_SIMD_SPEC_TYPE(i16, 8, __m128i);
-KFR_SIMD_SPEC_TYPE(i32, 4, __m128i);
-KFR_SIMD_SPEC_TYPE(i64, 2, __m128i);
-KFR_SIMD_SPEC_TYPE(f32, 4, __m128);
-KFR_SIMD_SPEC_TYPE(f64, 2, __m128d);
-#endif
-
-#ifdef CMT_ARCH_AVX
-KFR_SIMD_SPEC_TYPE(u8, 32, __m256i);
-KFR_SIMD_SPEC_TYPE(u16, 16, __m256i);
-KFR_SIMD_SPEC_TYPE(u32, 8, __m256i);
-KFR_SIMD_SPEC_TYPE(u64, 4, __m256i);
-KFR_SIMD_SPEC_TYPE(i8, 32, __m256i);
-KFR_SIMD_SPEC_TYPE(i16, 16, __m256i);
-KFR_SIMD_SPEC_TYPE(i32, 8, __m256i);
-KFR_SIMD_SPEC_TYPE(i64, 4, __m256i);
-KFR_SIMD_SPEC_TYPE(f32, 8, __m256);
-KFR_SIMD_SPEC_TYPE(f64, 4, __m256d);
-#endif
-
-#ifdef CMT_ARCH_AVX512
-KFR_SIMD_SPEC_TYPE(u8, 64, __m512i);
-KFR_SIMD_SPEC_TYPE(u16, 32, __m512i);
-KFR_SIMD_SPEC_TYPE(u32, 16, __m512i);
-KFR_SIMD_SPEC_TYPE(u64, 8, __m512i);
-KFR_SIMD_SPEC_TYPE(i8, 64, __m512i);
-KFR_SIMD_SPEC_TYPE(i16, 32, __m512i);
-KFR_SIMD_SPEC_TYPE(i32, 16, __m512i);
-KFR_SIMD_SPEC_TYPE(i64, 8, __m512i);
-KFR_SIMD_SPEC_TYPE(f32, 16, __m512);
-KFR_SIMD_SPEC_TYPE(f64, 8, __m512d);
-#endif
-
-#ifdef CMT_ARCH_NEON
-KFR_SIMD_SPEC_TYPE(u8, 16, uint8x16_t);
-KFR_SIMD_SPEC_TYPE(u16, 8, uint16x8_t);
-KFR_SIMD_SPEC_TYPE(u32, 4, uint32x4_t);
-KFR_SIMD_SPEC_TYPE(u64, 2, uint64x2_t);
-KFR_SIMD_SPEC_TYPE(i8, 16, int8x16_t);
-KFR_SIMD_SPEC_TYPE(i16, 8, int16x8_t);
-KFR_SIMD_SPEC_TYPE(i32, 4, int32x4_t);
-KFR_SIMD_SPEC_TYPE(i64, 2, int64x2_t);
-KFR_SIMD_SPEC_TYPE(f32, 4, float32x4_t);
-#ifdef CMT_ARCH_NEON64
-KFR_SIMD_SPEC_TYPE(f64, 2, float64x2_t);
-#endif
-#endif
-
-template <size_t N>
-struct raw_bytes
-{
- u8 raw[N];
-};
-
-#define KFR_CYCLE(...) \
- for (size_t i = 0; i < N; i++) \
- __VA_ARGS__
-
-#define KFR_C_CYCLE(...) \
- for (size_t i = 0; i < N; i++) \
- vs[i] = __VA_ARGS__
-
-#define KFR_R_CYCLE(...) \
- vec<T, N> result; \
- for (size_t i = 0; i < N; i++) \
- result.vs[i] = __VA_ARGS__; \
- return result
-
-#define KFR_B_CYCLE(...) \
- vec<T, N> result; \
- for (size_t i = 0; i < N; i++) \
- result.vs[i] = (__VA_ARGS__) ? constants<value_type>::allones() : value_type(0); \
- return result.asmask()
-
-template <typename T, size_t N>
-struct alignas(const_min(platform<>::maximum_vector_alignment, sizeof(T) * next_poweroftwo(N))) vec
- : vec_t<T, N>
-{
- constexpr static size_t simd_width = platform<T>::vector_width;
- constexpr static size_t count = (N + simd_width - 1) / simd_width;
-
- static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type");
-
- // type and size
- using value_type = T;
- constexpr static size_t size() noexcept { return N; }
-
- using scalar_type = T;
- constexpr static size_t scalar_size() noexcept { return N; }
-
- using simd_type = typename simd_type_holder<T, N>::type;
-
- using uvalue_type = utype<T>;
- using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>;
-
- using mask_t = mask<T, N>;
-
- using uvec = vec<uvalue_type, N>;
- using iuvec = vec<iuvalue_type, N>;
-
- // constructors and assignment
- // default
- constexpr vec() noexcept = default;
- // copy
- vec(const vec&) noexcept = default;
- // assignment
- CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default;
-
- template <size_t... indices>
- KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept
- {
- return vec<value_type, sizeof...(indices)>((indices < N ? vs[indices % N] : 0)...);
- }
- template <size_t... indices>
- KFR_I_CE vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept
- {
- return vec<value_type, sizeof...(indices)>(
- (indices < N ? vs[indices % N] : indices < 2 * N ? y.vs[(indices - N) % N] : 0)...);
- }
-
- template <typename U, typename = enable_if<(std::is_convertible<U, value_type>::value)>>
- KFR_I_CE vec(const U& s) noexcept
- {
- KFR_C_CYCLE(s);
- }
-
- constexpr vec(const simd_type& simd) noexcept : simd(simd) {}
- // from vector of another type
- template <typename U, typename = enable_if<is_simd_type<U>::value>>
- KFR_I_CE vec(const vec<U, N>& v) noexcept
- {
- KFR_C_CYCLE(static_cast<value_type>(v.vs[i]));
- }
- // from list
- template <typename... Us>
- KFR_I_CE vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept
- : vs{ s0, s1, static_cast<value_type>(rest)... }
- {
- }
- template <size_t N1, size_t... Ns, typename = enable_if<(csum<size_t, N1, Ns...>() == N)>>
- KFR_I_CE vec(const vec<T, N1>& v0, const vec<T, Ns>&... vecs) noexcept : simd(*concat(v0, vecs...))
- {
- }
-
- KFR_I_CE vec(czeros_t) noexcept { KFR_C_CYCLE(value_type(0)); }
- KFR_I_CE vec(cones_t) noexcept { KFR_C_CYCLE(constants<value_type>::allones()); }
-
- template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)>
- KFR_I_CE static vec frombits(const vec<U, M>& v) noexcept
- {
- vec r;
- r.bytes = v.flatten().bytes;
- return r;
- }
-
- KFR_I_CE vec operator+() const noexcept { return *this; }
- KFR_I_CE vec operator-() const noexcept { KFR_R_CYCLE(-this->vs[i]); }
- KFR_I_CE vec operator~() const noexcept
- {
- uvec xx = uvec::frombits(*this);
- KFR_CYCLE(xx.vs[i] = ~xx.vs[i]);
- return frombits(xx);
- }
-
- KFR_I_CE vec operator+(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] + y.vs[i]); }
- KFR_I_CE vec operator-(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] - y.vs[i]); }
- KFR_I_CE vec operator*(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] * y.vs[i]); }
- KFR_I_CE vec operator/(const vec& y) const noexcept { KFR_R_CYCLE(this->vs[i] / y.vs[i]); }
-
- KFR_I_CE vec operator<<(int shift) const noexcept
- {
- iuvec xx = iuvec::frombits(*this);
- KFR_CYCLE(xx.vs[i] <<= shift);
- return frombits(xx);
- }
- KFR_I_CE vec operator>>(int shift) const noexcept
- {
- iuvec xx = iuvec::frombits(*this);
- KFR_CYCLE(xx.vs[i] >>= shift);
- return frombits(xx);
- }
- KFR_I_CE vec operator&(const vec& y) const noexcept
- {
- uvec xx = uvec::frombits(*this);
- uvec yy = uvec::frombits(y);
- KFR_CYCLE(xx.vs[i] &= yy.vs[i]);
- return frombits(xx);
- }
- KFR_I_CE vec operator|(const vec& y) const noexcept
- {
- uvec xx = uvec::frombits(*this);
- uvec yy = uvec::frombits(y);
- KFR_CYCLE(xx.vs[i] |= yy.vs[i]);
- return frombits(xx);
- }
- KFR_I_CE vec operator^(const vec& y) const noexcept
- {
- uvec xx = uvec::frombits(*this);
- uvec yy = uvec::frombits(y);
- KFR_CYCLE(xx.vs[i] ^= yy.vs[i]);
- return frombits(xx);
- }
-
- KFR_I_CE mask_t operator==(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] == y.vs[i]); }
- KFR_I_CE mask_t operator!=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] != y.vs[i]); }
- KFR_I_CE mask_t operator<(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] < y.vs[i]); }
- KFR_I_CE mask_t operator>(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] > y.vs[i]); }
- KFR_I_CE mask_t operator<=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] <= y.vs[i]); }
- KFR_I_CE mask_t operator>=(const vec& y) const noexcept { KFR_B_CYCLE(this->vs[i] >= y.vs[i]); }
-
- constexpr mask_t asmask() const noexcept { return mask_t(simd); }
-
- KFR_I_CE vec& operator+=(const vec& y) noexcept { return *this = *this + y; }
- KFR_I_CE vec& operator-=(const vec& y) noexcept { return *this = *this - y; }
- KFR_I_CE vec& operator*=(const vec& y) noexcept { return *this = *this * y; }
- KFR_I_CE vec& operator/=(const vec& y) noexcept { return *this = *this / y; }
- KFR_I_CE vec& operator<<=(int shift) noexcept { return *this = *this << shift; }
- KFR_I_CE vec& operator>>=(int shift) noexcept { return *this = *this >> shift; }
- KFR_I_CE vec& operator&=(const vec& y) noexcept { return *this = *this & y; }
- KFR_I_CE vec& operator|=(const vec& y) noexcept { return *this = *this | y; }
- KFR_I_CE vec& operator^=(const vec& y) noexcept { return *this = *this ^ y; }
-
- KFR_I_CE vec& operator++() noexcept { return *this = *this + vec(1); }
- KFR_I_CE vec& operator--() noexcept { return *this = *this - vec(1); }
- KFR_I_CE vec operator++(int) noexcept
- {
- const vec z = *this;
- ++*this;
- return z;
- }
- KFR_I_CE vec operator--(int) noexcept
- {
- const vec z = *this;
- --*this;
- return z;
- }
-
- explicit KFR_I_CE vec(const value_type* src) { KFR_C_CYCLE(src[i]); }
- explicit KFR_I_CE vec(const value_type* src, cunaligned_t) { KFR_C_CYCLE(src[i]); }
- explicit KFR_I_CE vec(const value_type* src, caligned_t) { KFR_C_CYCLE(src[i]); }
-
- const vec& write(value_type* dest) const
- {
- KFR_CYCLE(dest[i] = vs[i]);
- return *this;
- }
- const vec& write(value_type* dest, cunaligned_t) const
- {
- KFR_CYCLE(dest[i] = vs[i]);
- return *this;
- }
- const vec& write(value_type* dest, caligned_t) const
- {
- KFR_CYCLE(dest[i] = vs[i]);
- return *this;
- }
-
- KFR_I_CE value_type operator[](size_t index) const noexcept { return vs[index]; }
- KFR_I_CE value_type& operator[](size_t index) noexcept { return vs[index]; }
-
- const vec& flatten() const noexcept { return *this; }
- simd_type operator*() const noexcept { return simd; }
- simd_type& operator*() noexcept { return simd; }
-
-protected:
- template <typename, size_t>
- friend struct vec;
-
- union {
- T vs[N];
- simd_type simd;
- raw_bytes<N * sizeof(T)> bytes;
- };
-};
-
-namespace internal
-{
-template <typename T, size_t N>
-CMT_INLINE vec<T, N> concat_impl(const vec<T, N>& x)
-{
- return x;
-}
-
-template <typename T, size_t N>
-CMT_INLINE vec<T, N * 2> concat_impl(const vec<T, N>& x, const vec<T, N>& y)
-{
- return x.shuffle(y, csizeseq_t<N * 2>());
-}
-
-template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)>
-CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- return x.shuffle(y.shuffle(csizeseq_t<N1>()), csizeseq_t<N1 * 2>()).shuffle(csizeseq_t<N1 + N2>());
-}
-
-template <typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)>
-CMT_INLINE vec<T, N1 + N2> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y)
-{
- return x.shuffle(csizeseq_t<N2, -(N2 - N1)>())
- .shuffle(y, csizeseq_t<N2 * 2>())
- .shuffle(csizeseq_t<N1 + N2, N2 - N1>());
-}
-
-template <typename T, size_t N1, size_t N2, size_t... Sizes>
-CMT_INLINE vec<T, csum<size_t, N1, N2, Sizes...>()> concat_impl(const vec<T, N1>& x, const vec<T, N2>& y,
- const vec<T, Sizes>&... args)
-{
- return concat_impl(concat_impl(x, y), args...);
-}
-} // namespace internal
-
-template <typename T, size_t... Ns>
-constexpr inline vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) noexcept
-{
- return internal::concat_impl(vs...);
-}
-} // namespace kfr
-
-CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/simd_x86.hpp b/include/kfr/base/simd_x86.hpp
@@ -1,272 +0,0 @@
-#pragma once
-
-#include "constants.hpp"
-#include "platform.hpp"
-#include "simd_intrin.hpp"
-namespace kfr
-{
-#ifdef CMT_ARCH_SSE2
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator+(const vec<f32, 4>& y) const noexcept
-{
- return _mm_add_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator-(const vec<f32, 4>& y) const noexcept
-{
- return _mm_sub_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator*(const vec<f32, 4>& y) const noexcept
-{
- return _mm_mul_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator/(const vec<f32, 4>& y) const noexcept
-{
- return _mm_div_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator&(const vec<f32, 4>& y) const noexcept
-{
- return _mm_and_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator|(const vec<f32, 4>& y) const noexcept
-{
- return _mm_or_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 4> vec<f32, 4>::operator^(const vec<f32, 4>& y) const noexcept
-{
- return _mm_xor_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator+(const vec<f64, 2>& y) const noexcept
-{
- return _mm_add_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator-(const vec<f64, 2>& y) const noexcept
-{
- return _mm_sub_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator*(const vec<f64, 2>& y) const noexcept
-{
- return _mm_mul_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator/(const vec<f64, 2>& y) const noexcept
-{
- return _mm_div_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator&(const vec<f64, 2>& y) const noexcept
-{
- return _mm_and_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator|(const vec<f64, 2>& y) const noexcept
-{
- return _mm_or_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 2> vec<f64, 2>::operator^(const vec<f64, 2>& y) const noexcept
-{
- return _mm_xor_pd(simd, y.simd);
-}
-
-#endif // CMT_ARCH_SSE2
-
-#ifdef CMT_ARCH_AVX
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator+(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_add_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator-(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_sub_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator*(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_mul_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator/(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_div_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator&(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_and_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator|(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_or_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 8> vec<f32, 8>::operator^(const vec<f32, 8>& y) const noexcept
-{
- return _mm256_xor_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator+(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_add_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator-(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_sub_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator*(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_mul_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator/(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_div_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator&(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_and_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator|(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_or_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 4> vec<f64, 4>::operator^(const vec<f64, 4>& y) const noexcept
-{
- return _mm256_xor_pd(simd, y.simd);
-}
-
-#endif // CMT_ARCH_AVX
-
-#ifdef CMT_ARCH_AVX512
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator+(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_add_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator-(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_sub_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator*(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_mul_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator/(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_div_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator&(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_and_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator|(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_or_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f32, 16> vec<f32, 16>::operator^(const vec<f32, 16>& y) const noexcept
-{
- return _mm512_xor_ps(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator+(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_add_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator-(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_sub_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator*(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_mul_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator/(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_div_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator&(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_and_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator|(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_or_pd(simd, y.simd);
-}
-
-template <>
-KFR_I_CE CMT_INLINE vec<f64, 8> vec<f64, 8>::operator^(const vec<f64, 8>& y) const noexcept
-{
- return _mm512_xor_pd(simd, y.simd);
-}
-
-#endif // CMT_ARCH_AVX
-
-} // namespace kfr
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -1,315 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/sin_cos.hpp"
-
-namespace kfr
-{
-
-/**
- * @brief Returns the trigonometric sine of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sin(const T1& x)
-{
- return intrinsics::sin(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric sine of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sin, E1> sin(E1&& x)
-{
- return { fn::sin(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric cosine of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> cos(const T1& x)
-{
- return intrinsics::cos(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric cosine of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cos, E1> cos(E1&& x)
-{
- return { fn::cos(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns an approximation of the trigonometric sine of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> fastsin(const T1& x)
-{
- return intrinsics::fastsin(x);
-}
-
-/**
- * @brief Returns template expression that returns an approximation of the trigonometric sine of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::fastsin, E1> fastsin(E1&& x)
-{
- return { fn::fastsin(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns an approximation of the trigonometric cosine of x.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> fastcos(const T1& x)
-{
- return intrinsics::fastcos(x);
-}
-
-/**
- * @brief Returns template expression that returns an approximation of the trigonometric cosine of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::fastcos, E1> fastcos(E1&& x)
-{
- return { fn::fastcos(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must
- * be a vector.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sincos(const T1& x)
-{
- return intrinsics::sincos(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric sine of the even elements of the x and
- * cosine of the odd elements. x must be a vector.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sincos, E1> sincos(E1&& x)
-{
- return { fn::sincos(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must
- * be a vector.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> cossin(const T1& x)
-{
- return intrinsics::cossin(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric cosine of the even elements of the x and
- * sine of the odd elements. x must be a vector.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cossin, E1> cossin(E1&& x)
-{
- return { fn::cossin(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric sine of the x (expressed in degrees).
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sindeg(const T1& x)
-{
- return intrinsics::sindeg(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric sine of the x (expressed in degrees).
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sindeg, E1> sindeg(E1&& x)
-{
- return { fn::sindeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric cosine of the x (expressed in degrees).
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> cosdeg(const T1& x)
-{
- return intrinsics::cosdeg(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric cosine of the x (expressed in degrees).
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cosdeg, E1> cosdeg(E1&& x)
-{
- return { fn::cosdeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns an approximation of the trigonometric sine of the x (expressed in degrees).
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> fastsindeg(const T1& x)
-{
- return intrinsics::fastsindeg(x);
-}
-
-/**
- * @brief Returns template expression that returns an approximation of the trigonometric sine of the x
- * (expressed in degrees).
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::fastsindeg, E1> fastsindeg(E1&& x)
-{
- return { fn::fastsindeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns an approximation of the trigonometric cosine of the x (expressed in degrees).
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> fastcosdeg(const T1& x)
-{
- return intrinsics::fastcosdeg(x);
-}
-
-/**
- * @brief Returns template expression that returns an approximation of the trigonometric cosine of the x
- * (expressed in degrees).
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::fastcosdeg, E1> fastcosdeg(E1&& x)
-{
- return { fn::fastcosdeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must
- * be a vector and expressed in degrees.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sincosdeg(const T1& x)
-{
- return intrinsics::sincosdeg(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric sine of the even elements of the x and
- * cosine of the odd elements. x must be expressed in degrees.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sincosdeg, E1> sincosdeg(E1&& x)
-{
- return { fn::sincosdeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must
- * be a vector and expressed in degrees.
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> cossindeg(const T1& x)
-{
- return intrinsics::cossindeg(x);
-}
-
-/**
- * @brief Returns template expression that returns the trigonometric cosine of the even elements of the x and
- * sine of the odd elements. x must be expressed in degrees.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::cossindeg, E1> cossindeg(E1&& x)
-{
- return { fn::cossindeg(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the sinc function of x.
- * \f[
- * sinc(x) = \frac{sin(x)}{x}
- * \f]
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> sinc(const T1& x)
-{
- return intrinsics::sinc(x);
-}
-
-/**
- * @brief Returns template expression that returns the sinc function of x.
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sinc, E1> sinc(E1&& x)
-{
- return { fn::sinc(), std::forward<E1>(x) };
-}
-
-/**
- * @brief Returns the trigonometric sine of the angle 2x using sin(x) and cos(x).
- */
-template <typename T>
-KFR_SINTRIN T sin2x(const T& sinx, const T& cosx)
-{
- return 2 * sinx * cosx;
-}
-
-/**
- * @brief Returns the trigonometric sine of the angle 3x using sin(x) and cos(x).
- */
-template <typename T>
-KFR_SINTRIN T sin3x(const T& sinx, const T& cosx)
-{
- return sinx * (-1 + 4 * sqr(cosx));
-}
-
-/**
- * @brief Returns the trigonometric cosine of the angle 2x using sin(x) and cos(x).
- */
-template <typename T>
-KFR_SINTRIN T cos2x(const T& sinx, const T& cosx)
-{
- return sqr(cosx) - sqr(sinx);
-}
-
-/**
- * @brief Returns the trigonometric cosine of the angle 3x using sin(x) and cos(x).
- */
-template <typename T>
-KFR_SINTRIN T cos3x(const T& sinx, const T& cosx)
-{
- return cosx * (1 - 4 * sqr(sinx));
-}
-} // namespace kfr
diff --git a/include/kfr/base/small_buffer.hpp b/include/kfr/base/small_buffer.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup utility
+/** @addtogroup types
* @{
*/
/*
@@ -31,16 +31,15 @@
namespace kfr
{
-
template <typename T, std::size_t Capacity = 16>
struct small_buffer
{
public:
- small_buffer() noexcept : m_size(0), m_data(m_preallocated) {}
+ small_buffer() CMT_NOEXCEPT : m_size(0), m_data(m_preallocated) {}
small_buffer(std::size_t size) : small_buffer() { resize(size); }
- friend void swap(small_buffer<T, Capacity>& first, small_buffer<T, Capacity>& second) noexcept
+ friend void swap(small_buffer<T, Capacity>& first, small_buffer<T, Capacity>& second) CMT_NOEXCEPT
{
using std::swap;
diff --git a/include/kfr/base/sort.hpp b/include/kfr/base/sort.hpp
@@ -25,12 +25,15 @@
*/
#pragma once
-#include "min_max.hpp"
-#include "shuffle.hpp"
-#include "vec.hpp"
+#include "../math/min_max.hpp"
+#include "../simd/shuffle.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
/**
* @brief Sort the elements in the vector in ascending order
* @param x input vector
@@ -40,12 +43,12 @@ namespace kfr
* @endcode
*/
template <typename T, size_t N>
-CMT_INLINE vec<T, N> sort(const vec<T, N>& x)
+KFR_INTRINSIC vec<T, N> sort(const vec<T, N>& x)
{
constexpr size_t Nhalf = N / 2;
vec<T, Nhalf> e = low(x);
vec<T, Nhalf> o = high(x);
- constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>());
+ constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
for (size_t i = 0; i < Nhalf; i++)
{
vec<T, Nhalf> t;
@@ -73,12 +76,12 @@ CMT_INLINE vec<T, N> sort(const vec<T, N>& x)
* @endcode
*/
template <typename T, size_t N>
-CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x)
+KFR_INTRINSIC vec<T, N> sortdesc(const vec<T, N>& x)
{
constexpr size_t Nhalf = N / 2;
vec<T, Nhalf> e = low(x);
vec<T, Nhalf> o = high(x);
- constexpr auto blend0 = cconcat(csizes_t<1>(), csizeseq_t<Nhalf - 1, 0, 0>());
+ constexpr auto blend0 = cconcat(csizes<1>, csizeseq<Nhalf - 1, 0, 0>);
for (size_t i = 0; i < Nhalf; i++)
{
vec<T, Nhalf> t;
@@ -96,4 +99,5 @@ CMT_INLINE vec<T, N> sortdesc(const vec<T, N>& x)
}
return interleavehalfs(concat(e, o));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/base/specializations.i b/include/kfr/base/specializations.i
@@ -1,109 +0,0 @@
-/**
- * Copyright (C) 2016 D Levin (http://www.kfrlib.com)
- * This file is part of KFR
- *
- * KFR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * KFR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with KFR.
- */
-#pragma once
-
-#include "vec.hpp"
-#ifndef KFR_SHUFFLE_SPECIALIZATIONS
-#include "shuffle.hpp"
-#endif
-
-#ifdef KFR_COMPILER_GNU
-
-namespace kfr
-{
-namespace internal
-{
-template <>
-inline vec<f32, 32> shufflevector<f32, 32>(
- csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14,
- 15, 22, 23, 30, 31>,
- const vec<f32, 32>& x, const vec<f32, 32>&)
-{
- f32x32 w = x;
-
- w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)),
- permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w)));
-
- w = permutegroups<(4), 0, 4, 2, 6, 1, 5, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
- return w;
-}
-
-template <>
-inline vec<f32, 32> shufflevector<f32, 32>(
- csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22,
- 23, 14, 15, 30, 31>,
- const vec<f32, 32>& x, const vec<f32, 32>&)
-{
- f32x32 w = x;
-
- w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)),
- permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w)));
-
- w = permutegroups<(4), 0, 4, 1, 5, 2, 6, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
- return w;
-}
-
-inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x)
-{
- return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18,
- 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>,
- x, x);
-}
-
-template <>
-inline vec<f32, 64> shufflevector<f32, 64>(
- csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13,
- 44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38,
- 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>,
- const vec<f32, 64>& x, const vec<f32, 64>&)
-{
- return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x))));
-}
-
-template <>
-inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>,
- const vec<f32, 16>& x, const vec<f32, 16>&)
-{
-// asm volatile("int $3");
- const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x);
-
- return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)), shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx)));
-}
-
-template <>
-inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>,
- const vec<f32, 16>& x, const vec<f32, 16>&)
-{
- const vec<f32, 16> xx = concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
-
- return permutegroups<(4), 0, 2, 1, 3>(xx);
-}
-
-template <>
-inline vec<f32, 32> shufflevector<f32, 32>(
- csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13,
- 29, 14, 30, 15, 31>,
- const vec<f32, 32>& x, const vec<f32, 32>&)
-{
- const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x);
-
- return concat(interleavehalfs(low(xx)), interleavehalfs(high(xx)));
-}
-}
-}
-#endif
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -1,50 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/sqrt.hpp"
-
-namespace kfr
-{
-
-/**
- * @brief Returns the positive square root of the x. \f$\sqrt{x}\f$
- */
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> sqrt(const T1& x)
-{
- return intrinsics::sqrt(x);
-}
-
-/**
- * @brief Returns template expression that returns the positive square root of the x. \f$\sqrt{x}\f$
- */
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::sqrt, E1> sqrt(E1&& x)
-{
- return { fn::sqrt(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -1,56 +0,0 @@
-/** @addtogroup math
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "impl/tan.hpp"
-
-namespace kfr
-{
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> tan(const T1& x)
-{
- return intrinsics::tan(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::tan, E1> tan(E1&& x)
-{
- return { fn::tan(), std::forward<E1>(x) };
-}
-
-template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC flt_type<T1> tandeg(const T1& x)
-{
- return intrinsics::tandeg(x);
-}
-
-template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::tandeg, E1> tandeg(E1&& x)
-{
- return { fn::tandeg(), std::forward<E1>(x) };
-}
-} // namespace kfr
diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp
@@ -1,429 +0,0 @@
-/** @addtogroup types
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-#include "kfr.h"
-
-#include "intrinsics.h"
-
-#include <cmath>
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
-
-#ifdef KFR_TESTING
-#include "../testo/testo.hpp"
-#endif
-
-#include "../cometa.hpp"
-
-#define KFR_ENABLE_IF CMT_ENABLE_IF
-
-/**
- * @brief Internal macro for functions
- */
-#define KFR_FN(FN) \
- namespace fn \
- { \
- struct FN \
- { \
- template <typename... Args> \
- CMT_INLINE_MEMBER decltype(::kfr::FN(std::declval<Args>()...)) operator()(Args&&... args) const \
- { \
- return ::kfr::FN(std::forward<Args>(args)...); \
- } \
- }; \
- }
-
-/**
- * @brief Internal macro for functions
- */
-#define KFR_I_FN(FN) \
- namespace fn \
- { \
- struct FN \
- { \
- template <typename... Args> \
- CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()( \
- Args&&... args) const \
- { \
- return ::kfr::intrinsics::FN(std::forward<Args>(args)...); \
- } \
- }; \
- }
-
-namespace kfr
-{
-// Include all from CoMeta library
-using namespace cometa;
-
-/// @brief Short names for common types
-using f32 = float;
-using f64 = double;
-using i8 = int8_t;
-using i16 = int16_t;
-using i32 = int32_t;
-using i64 = int64_t;
-using u8 = uint8_t;
-using u16 = uint16_t;
-using u32 = uint32_t;
-using u64 = uint64_t;
-using umax = uint64_t;
-using imax = int64_t;
-using fmax = double;
-using f80 = long double;
-
-#if defined(KFR_BASETYPE_F32) || defined(KFR_NO_NATIVE_F64)
-/// @brief Floating point type used by default
-using fbase = f32;
-#else
-/// @brief Floating point type used by default
-using fbase = f64;
-#endif
-
-constexpr ctype_t<f32> ctype_f32{};
-constexpr ctype_t<f64> ctype_f64{};
-constexpr ctype_t<i8> ctype_i8{};
-constexpr ctype_t<i16> ctype_i16{};
-constexpr ctype_t<i32> ctype_i32{};
-constexpr ctype_t<i64> ctype_i64{};
-constexpr ctype_t<u8> ctype_u8{};
-constexpr ctype_t<u16> ctype_u16{};
-constexpr ctype_t<u32> ctype_u32{};
-constexpr ctype_t<u64> ctype_u64{};
-constexpr ctype_t<umax> ctype_umax{};
-constexpr ctype_t<imax> ctype_imax{};
-constexpr ctype_t<fmax> ctype_fmax{};
-constexpr ctype_t<f80> ctype_f80{};
-constexpr ctype_t<fbase> ctype_base{};
-
-struct u24
-{
- u8 raw[3];
-};
-
-struct i24
-{
- u8 raw[3];
-
- i24(i32 x)
- {
- raw[0] = x & 0xFF;
- raw[1] = (x >> 8) & 0xFF;
- raw[2] = (x >> 16) & 0xFF;
- }
-
- i32 as_int() const
- {
- return static_cast<i32>(raw[0]) | static_cast<i32>(raw[1] << 8) |
- (static_cast<i32>(raw[2] << 24) >> 8);
- }
-
- operator int() const { return as_int(); }
-};
-
-struct f16
-{
- u16 raw;
-};
-
-/// @brief An enumeration representing data type
-template <typename T1>
-struct range
-{
- T1 min;
- T1 max;
- T1 distance() const { return max - min; }
-};
-
-/// @brief An enumeration representing data type
-enum class datatype : int
-{
- typebits_mask = 0xFF,
- f = 0x100,
- i = 0x200,
- u = 0x300,
- c = 0x400,
- typeclass_mask = 0xF00,
- x1 = 0x1000,
- x2 = 0x2000,
- x3 = 0x3000,
- x4 = 0x4000,
- typecomponents_mask = 0xF000,
- f16 = static_cast<int>(f) | static_cast<int>(x1) | 16,
- f32 = static_cast<int>(f) | static_cast<int>(x1) | 32,
- f64 = static_cast<int>(f) | static_cast<int>(x1) | 64,
- f80 = static_cast<int>(f) | static_cast<int>(x1) | 80,
- i8 = static_cast<int>(i) | static_cast<int>(x1) | 8,
- i16 = static_cast<int>(i) | static_cast<int>(x1) | 16,
- i24 = static_cast<int>(i) | static_cast<int>(x1) | 24,
- i32 = static_cast<int>(i) | static_cast<int>(x1) | 32,
- i64 = static_cast<int>(i) | static_cast<int>(x1) | 64,
- u8 = static_cast<int>(u) | static_cast<int>(x1) | 8,
- u16 = static_cast<int>(u) | static_cast<int>(x1) | 16,
- u24 = static_cast<int>(u) | static_cast<int>(x1) | 24,
- u32 = static_cast<int>(u) | static_cast<int>(x1) | 32,
- u64 = static_cast<int>(u) | static_cast<int>(x1) | 64,
- c32 = static_cast<int>(c) | static_cast<int>(x2) | 32,
- c64 = static_cast<int>(c) | static_cast<int>(x2) | 64
-};
-
-inline datatype operator|(datatype x, datatype y)
-{
- using type = underlying_type<datatype>;
- return static_cast<datatype>(static_cast<type>(x) | static_cast<type>(y));
-}
-
-inline datatype operator&(datatype x, datatype y)
-{
- using type = underlying_type<datatype>;
- return static_cast<datatype>(static_cast<type>(x) & static_cast<type>(y));
-}
-
-template <typename T>
-constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value
- ? datatype::f
- : std::is_integral<typename compound_type_traits<T>::subtype>::value
- ? (std::is_unsigned<typename compound_type_traits<T>::subtype>::value
- ? datatype::u
- : datatype::i)
- : datatype();
-
-template <typename T>
-using is_f_class = std::integral_constant<bool, typeclass<T> == datatype::f>;
-template <typename T>
-using is_u_class = std::integral_constant<bool, typeclass<T> == datatype::u>;
-template <typename T>
-using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>;
-
-template <typename T>
-struct typebits
-{
- static_assert(is_number<deep_subtype<T>>::value, "");
- constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8;
- constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width;
- using subtype = typename compound_type_traits<T>::subtype;
-};
-
-namespace fn
-{
-///@copybrief cometa::pass_through
-using pass_through = cometa::fn_pass_through;
-
-///@copybrief cometa::noop
-using noop = cometa::fn_noop;
-
-///@copybrief cometa::get_first
-using get_first = cometa::fn_get_first;
-
-///@copybrief cometa::get_second
-using get_second = cometa::fn_get_second;
-
-///@copybrief cometa::get_third
-using get_third = cometa::fn_get_third;
-
-///@copybrief cometa::returns
-template <typename T>
-using returns = cometa::fn_returns<T>;
-} // namespace fn
-
-template <typename T>
-using ftype =
- typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>;
-template <typename T>
-using itype =
- typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>;
-template <typename T>
-using utype =
- typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>;
-
-template <typename T>
-using fsubtype = ftype<subtype<T>>;
-template <typename T>
-using isubtype = itype<subtype<T>>;
-template <typename T>
-using usubtype = utype<subtype<T>>;
-
-namespace internal
-{
-template <typename T>
-struct flt_type_impl
-{
- using type = fbase;
-};
-
-template <>
-struct flt_type_impl<float>
-{
- using type = float;
-};
-template <>
-struct flt_type_impl<double>
-{
- using type = double;
-};
-} // namespace internal
-
-template <typename T>
-using flt_type = typename internal::flt_type_impl<T>::type;
-
-namespace internal
-{
-#ifdef CMT_COMPILER_CLANG
-#define builtin_addressof(x) __builtin_addressof(x)
-#else
-template <class T>
-inline T* builtin_addressof(T& arg)
-{
- return reinterpret_cast<T*>(&const_cast<char&>(reinterpret_cast<const volatile char&>(arg)));
-}
-#endif
-
-#ifdef CMT_COMPILER_GNU
-CMT_INLINE f32 builtin_sqrt(f32 x) { return __builtin_sqrtf(x); }
-CMT_INLINE f64 builtin_sqrt(f64 x) { return __builtin_sqrt(x); }
-CMT_INLINE f80 builtin_sqrt(f80 x) { return __builtin_sqrtl(x); }
-CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size)
-{
- __builtin_memcpy(dest, src, size);
-}
-CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); }
-#else
-
-CMT_INLINE f32 builtin_sqrt(f32 x) { return ::sqrtf(x); }
-CMT_INLINE f64 builtin_sqrt(f64 x) { return ::sqrt(x); }
-CMT_INLINE f80 builtin_sqrt(f80 x) { return ::sqrtl(x); }
-CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { ::memcpy(dest, src, size); }
-CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { ::memset(dest, val, size); }
-
-#endif
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes")
-
-template <typename T, bool A>
-struct struct_with_alignment
-{
- T value;
- KFR_INTRIN void operator=(T value) { this->value = value; }
-};
-
-template <typename T>
-struct struct_with_alignment<T, false>
-{
- T value;
- KFR_INTRIN void operator=(T value) { this->value = value; }
-}
-#ifdef CMT_GNU_ATTRIBUTES
-__attribute__((__packed__, __may_alias__)) //
-#endif
-;
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
-} // namespace internal
-
-/// @brief Fills a value with zeros
-template <typename T1>
-CMT_INLINE void zeroize(T1& value)
-{
- internal::builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1));
-}
-
-/// @brief Used to determine the initial value for reduce functions
-template <typename T>
-struct initialvalue
-{
-};
-
-namespace internal
-{
-template <size_t width, typename Fn>
-CMT_INLINE void block_process_impl(size_t& i, size_t size, Fn&& fn)
-{
- CMT_LOOP_NOUNROLL
- for (; i < size / width * width; i += width)
- fn(i, csize_t<width>());
-}
-} // namespace internal
-
-template <size_t... widths, typename Fn>
-CMT_INLINE void block_process(size_t size, csizes_t<widths...>, Fn&& fn)
-{
- size_t i = 0;
- swallow{ (internal::block_process_impl<widths>(i, size, std::forward<Fn>(fn)), 0)... };
-}
-
-template <typename T>
-struct is_simd_type
- : std::integral_constant<
- bool, std::is_same<T, float>::value || std::is_same<T, double>::value ||
- std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value ||
- std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
- std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
- std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
- std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value>
-{
-};
-
-template <typename T, size_t N>
-struct vec_t
-{
- static_assert(N > 0 && N <= 1024, "Invalid vector size");
-
- static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type");
-
- using value_type = T;
- constexpr static size_t size() noexcept { return N; }
- constexpr vec_t() noexcept = default;
-
- using scalar_type = subtype<T>;
- constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; }
-};
-
-constexpr size_t index_undefined = static_cast<size_t>(-1);
-
-struct czeros_t
-{
-};
-struct cones_t
-{
-};
-constexpr czeros_t czeros{};
-constexpr cones_t cones{};
-
-using caligned_t = cbool_t<true>;
-using cunaligned_t = cbool_t<false>;
-
-constexpr caligned_t caligned{};
-constexpr cunaligned_t cunaligned{};
-
-#ifdef CMT_INTRINSICS_IS_CONSTEXPR
-#define KFR_I_CE constexpr
-#else
-#define KFR_I_CE
-#endif
-} // namespace kfr
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp
@@ -27,10 +27,10 @@
#include "../cometa/array.hpp"
-#include "function.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/read_write.hpp"
+#include "../simd/types.hpp"
#include "memory.hpp"
-#include "read_write.hpp"
-#include "types.hpp"
CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4324))
@@ -97,20 +97,14 @@ struct univector_base : input_expression, output_expression
using output_expression::end_block;
template <typename U, size_t N>
- CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
+ KFR_MEM_INTRINSIC void operator()(coutput_t, size_t index, const vec<U, N>& value)
{
T* data = derived_cast<Class>(this)->data();
write(ptr_cast<T>(data) + index, vec<T, N>(value));
}
- template <typename U, size_t N>
- CMT_INLINE vec<U, N> operator()(cinput_t, size_t index, vec_t<U, N>) const
- {
- const T* data = derived_cast<Class>(this)->data();
- return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index));
- }
template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)>
- CMT_INLINE Class& operator=(Input&& input)
+ KFR_MEM_INTRINSIC Class& operator=(Input&& input)
{
assign_expr(std::forward<Input>(input));
return *derived_cast<Class>(this);
@@ -254,15 +248,15 @@ struct univector_base : input_expression, output_expression
protected:
template <typename Input>
- CMT_INLINE void assign_expr(Input&& input)
+ KFR_MEM_INTRINSIC void assign_expr(Input&& input)
{
process(*derived_cast<Class>(this), std::forward<Input>(input));
}
private:
- CMT_INLINE size_t get_size() const { return derived_cast<Class>(this)->size(); }
- CMT_INLINE const T* get_data() const { return derived_cast<Class>(this)->data(); }
- CMT_INLINE T* get_data() { return derived_cast<Class>(this)->data(); }
+ KFR_MEM_INTRINSIC size_t get_size() const { return derived_cast<Class>(this)->size(); }
+ KFR_MEM_INTRINSIC const T* get_data() const { return derived_cast<Class>(this)->data(); }
+ KFR_MEM_INTRINSIC T* get_data() { return derived_cast<Class>(this)->data(); }
static void copy(T* dest, const T* src, size_t size)
{
@@ -283,12 +277,12 @@ struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, S
this->assign_expr(std::forward<Input>(input));
}
template <typename... Args>
- constexpr univector(const T& x, const Args&... args) noexcept
+ constexpr univector(const T& x, const Args&... args) CMT_NOEXCEPT
: std::array<T, Size>{ { x, static_cast<T>(args)... } }
{
}
- constexpr univector() noexcept(noexcept(std::array<T, Size>())) = default;
+ constexpr univector() CMT_NOEXCEPT_SPEC(noexcept(std::array<T, Size>())) = default;
constexpr univector(size_t, const T& value) { std::fill(this->begin(), this->end(), value); }
constexpr static bool size_known = true;
constexpr static bool is_array = true;
@@ -298,13 +292,13 @@ struct alignas(platform<>::maximum_vector_alignment) univector : std::array<T, S
constexpr static bool is_pod = kfr::is_pod<T>::value;
using value_type = T;
- value_type get(size_t index, value_type fallback_value) const noexcept
+ value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT
{
return index < this->size() ? this->operator[](index) : fallback_value;
}
using univector_base<T, univector>::operator=;
- void resize(size_t) noexcept {}
+ void resize(size_t) CMT_NOEXCEPT {}
};
template <typename T>
@@ -334,7 +328,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T
constexpr univector(univector<U, Tag>& other) : array_ref<T>(other.data(), other.size())
{
}
- void resize(size_t) noexcept {}
+ void resize(size_t) CMT_NOEXCEPT {}
constexpr static bool size_known = false;
constexpr static bool is_array = false;
constexpr static bool is_array_ref = true;
@@ -342,7 +336,7 @@ struct univector<T, tag_array_ref> : array_ref<T>, univector_base<T, univector<T
constexpr static bool is_aligned = false;
using value_type = remove_const<T>;
- value_type get(size_t index, value_type fallback_value) const noexcept
+ value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT
{
return index < this->size() ? this->operator[](index) : fallback_value;
}
@@ -364,9 +358,11 @@ struct univector<T, tag_dynamic_vector> : std::vector<T, allocator<T>>,
this->resize(input.size());
this->assign_expr(std::forward<Input>(input));
}
- constexpr univector() noexcept(noexcept(std::vector<T, allocator<T>>())) = default;
+ constexpr univector() CMT_NOEXCEPT_SPEC(noexcept(std::vector<T, allocator<T>>())) = default;
constexpr univector(const std::vector<T, allocator<T>>& other) : std::vector<T, allocator<T>>(other) {}
- constexpr univector(std::vector<T, allocator<T>>&& other) : std::vector<T, allocator<T>>(std::move(other)) {}
+ constexpr univector(std::vector<T, allocator<T>>&& other) : std::vector<T, allocator<T>>(std::move(other))
+ {
+ }
constexpr univector(const array_ref<T>& other) : std::vector<T, allocator<T>>(other.begin(), other.end())
{
}
@@ -378,19 +374,19 @@ struct univector<T, tag_dynamic_vector> : std::vector<T, allocator<T>>,
constexpr univector(const std::vector<T, Allocator>&) = delete;
template <typename Allocator>
constexpr univector(std::vector<T, Allocator>&&) = delete;
- constexpr static bool size_known = false;
- constexpr static bool is_array = false;
- constexpr static bool is_array_ref = false;
- constexpr static bool is_vector = true;
- constexpr static bool is_aligned = true;
- using value_type = T;
+ constexpr static bool size_known = false;
+ constexpr static bool is_array = false;
+ constexpr static bool is_array_ref = false;
+ constexpr static bool is_vector = true;
+ constexpr static bool is_aligned = true;
+ using value_type = T;
- value_type get(size_t index, value_type fallback_value) const noexcept
+ value_type get(size_t index, value_type fallback_value) const CMT_NOEXCEPT
{
return index < this->size() ? this->operator[](index) : fallback_value;
}
template <typename Input, KFR_ENABLE_IF(is_input_expression<Input>::value)>
- CMT_INLINE univector& operator=(Input&& input)
+ KFR_MEM_INTRINSIC univector& operator=(Input&& input)
{
if (input.size() != infinite_size)
this->resize(input.size());
@@ -416,40 +412,18 @@ using univector3d = abstract_vector<abstract_vector<univector<T, Size3>, Size2>,
/// @brief Creates univector from data and size
template <typename T>
-CMT_INLINE univector_ref<T> make_univector(T* data, size_t size)
+KFR_INTRINSIC univector_ref<T> make_univector(T* data, size_t size)
{
return univector_ref<T>(data, size);
}
/// @brief Creates univector from data and size
template <typename T>
-CMT_INLINE univector_ref<const T> make_univector(const T* data, size_t size)
+KFR_INTRINSIC univector_ref<const T> make_univector(const T* data, size_t size)
{
return univector_ref<const T>(data, size);
}
-/// @brief Converts an expression to univector
-template <typename Expr, typename T = value_type_of<Expr>>
-CMT_INLINE univector<T> render(Expr&& expr)
-{
- static_assert(!is_infinite<Expr>::value,
- "render: Can't process infinite expressions. Pass size as a second argument to render.");
- univector<T> result;
- result.resize(expr.size());
- result = expr;
- return result;
-}
-
-/// @brief Converts an expression to univector
-template <typename Expr, typename T = value_type_of<Expr>>
-CMT_INLINE univector<T> render(Expr&& expr, size_t size, size_t offset = 0)
-{
- univector<T> result;
- result.resize(size);
- result = slice(expr, offset, size);
- return result;
-}
-
/// @brief Single producer single consumer lock-free ring buffer
template <typename T>
struct lockfree_ring_buffer
@@ -476,8 +450,8 @@ struct lockfree_ring_buffer
const size_t real_tail = cur_tail % buffer.size();
const size_t first_size = std::min(buffer.size() - real_tail, size);
- internal::builtin_memcpy(buffer.data() + real_tail, source, first_size * sizeof(T));
- internal::builtin_memcpy(buffer.data(), source + first_size, (size - first_size) * sizeof(T));
+ builtin_memcpy(buffer.data() + real_tail, source, first_size * sizeof(T));
+ builtin_memcpy(buffer.data(), source + first_size, (size - first_size) * sizeof(T));
std::atomic_thread_fence(std::memory_order_release);
@@ -500,8 +474,8 @@ struct lockfree_ring_buffer
const size_t real_front = cur_front % buffer.size();
const size_t first_size = std::min(buffer.size() - real_front, size);
- internal::builtin_memcpy(dest, buffer.data() + real_front, first_size * sizeof(T));
- internal::builtin_memcpy(dest + first_size, buffer.data(), (size - first_size) * sizeof(T));
+ builtin_memcpy(dest, buffer.data() + real_front, first_size * sizeof(T));
+ builtin_memcpy(dest + first_size, buffer.data(), (size - first_size) * sizeof(T));
std::atomic_thread_fence(std::memory_order_release);
@@ -514,6 +488,47 @@ private:
char cacheline_filler[64 - sizeof(std::atomic<size_t>)];
std::atomic<size_t> tail;
};
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T, univector_tag Tag, typename U, size_t N>
+KFR_INTRINSIC vec<U, N> get_elements(const univector<T, Tag>& self, cinput_t, size_t index, vec_shape<U, N>)
+{
+ const T* data = self.data();
+ return static_cast<vec<U, N>>(read<N>(ptr_cast<T>(data) + index));
+}
+
+/// @brief Converts an expression to univector
+template <typename Expr, typename T = value_type_of<Expr>>
+KFR_INTRINSIC univector<T> render(Expr&& expr)
+{
+ static_assert(!is_infinite<Expr>::value,
+ "render: Can't process infinite expressions. Pass size as a second argument to render.");
+ univector<T> result;
+ result.resize(expr.size());
+ result = expr;
+ return result;
+}
+
+/// @brief Converts an expression to univector
+template <typename Expr, typename T = value_type_of<Expr>>
+KFR_INTRINSIC univector<T> render(Expr&& expr, size_t size, size_t offset = 0)
+{
+ univector<T> result;
+ result.resize(size);
+ result = slice(expr, offset, size);
+ return result;
+}
+
+/// @brief Converts an expression to univector
+template <typename Expr, size_t Size, typename T = value_type_of<Expr>>
+KFR_INTRINSIC univector<T, Size> render(Expr&& expr, csize_t<Size>)
+{
+ univector<T, Size> result;
+ result = expr;
+ return result;
+}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -1,1171 +0,0 @@
-/** @addtogroup types
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "kfr.h"
-
-#include "constants.hpp"
-#include "platform.hpp"
-#include "types.hpp"
-
-namespace kfr
-{
-
-template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)>
-CMT_INLINE vec<T, Nout> low(const vec<T, N>& x);
-template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)>
-CMT_INLINE vec<T, Nout> high(const vec<T, N>& x);
-} // namespace kfr
-
-#ifdef CMT_COMPILER_CLANG
-#include "simd_clang.hpp"
-#else
-#include "simd_intrin.hpp"
-#ifdef CMT_ARCH_X86
-#include "simd_x86.hpp"
-#endif
-#endif
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked")
-
-CMT_PRAGMA_MSVC(warning(push))
-CMT_PRAGMA_MSVC(warning(disable : 4814))
-
-namespace kfr
-{
-
-template <typename T>
-using maskfor = typename T::mask_t;
-
-template <typename T, size_t N>
-struct mask : protected vec<T, N>
-{
- using base = vec<T, N>;
- KFR_I_CE mask() noexcept = default;
- KFR_I_CE mask(const mask&) noexcept = default;
- KFR_I_CE mask& operator=(const mask&) noexcept = default;
- using simd_type = typename base::simd_type;
-
- simd_type operator*() const noexcept { return this->simd; }
- simd_type& operator*() noexcept { return this->simd; }
-
- KFR_I_CE mask(const base& v) noexcept
- //: base(base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()))
- {
- this->simd = *base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec());
- }
-
- KFR_I_CE mask(const simd_type& simd) : base(simd) {}
- template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
- KFR_I_CE mask(const mask<U, N>& m) : base(base::frombits(m.asvec()))
- {
- }
- template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
- KFR_I_CE mask(const vec<U, N>& m) : base(base::frombits(m))
- {
- }
- KFR_I_CE mask operator&(const mask& y) const noexcept
- {
- return static_cast<const base&>(*this) & static_cast<const base&>(y);
- }
- KFR_I_CE mask operator|(const mask& y) const noexcept
- {
- return static_cast<const base&>(*this) | static_cast<const base&>(y);
- }
- KFR_I_CE mask operator&&(const mask& y) const noexcept
- {
- return static_cast<const base&>(*this) & static_cast<const base&>(y);
- }
- KFR_I_CE mask operator||(const mask& y) const noexcept
- {
- return static_cast<const base&>(*this) | static_cast<const base&>(y);
- }
- KFR_I_CE mask operator^(const mask& y) const noexcept
- {
- return static_cast<const base&>(*this) ^ static_cast<const base&>(y);
- }
- KFR_I_CE mask operator~() const noexcept { return ~static_cast<const base&>(*this); }
-
- bool operator[](size_t index) const noexcept;
-
- constexpr base asvec() const noexcept { return reinterpret_cast<const base&>(*this); }
-};
-
-namespace internal
-{
-
-constexpr inline size_t scale_get_index(size_t counter, size_t groupsize, size_t index)
-{
- return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index);
-}
-
-template <size_t counter, size_t groupsize, size_t... indices>
-constexpr inline size_t scale_get_index(csizes_t<indices...>)
-{
- return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>()));
-}
-
-template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)>
-constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) noexcept
- -> csizes_t<scale_get_index<counter, groupsize>(ind)...>
-{
- return {};
-}
-} // namespace internal
-
-template <size_t groupsize, size_t... indices>
-constexpr inline auto scale() noexcept
-{
- return internal::scale_impl(csizes_t<indices...>(), csizeseq_t<sizeof...(indices) * groupsize>());
-}
-
-template <typename T, size_t Nin, size_t N>
-struct vec<vec<T, Nin>, N> : private vec<T, Nin * N>
-{
- using base = vec<T, Nin * N>;
-
- using value_type = vec<T, Nin>;
- constexpr static size_t size() noexcept { return N; }
-
- using scalar_type = T;
- constexpr static size_t scalar_size() noexcept { return Nin * N; }
-
- using simd_type = typename base::simd_type;
-
- constexpr vec() noexcept = default;
- constexpr vec(const vec&) noexcept = default;
- CMT_GNU_CONSTEXPR vec& operator=(const vec&) CMT_GNU_NOEXCEPT = default;
- constexpr vec(const simd_type& simd) noexcept : base(simd) {}
- constexpr vec(czeros_t) noexcept : base(czeros) {}
- constexpr vec(cones_t) noexcept : base(cones) {}
-
- constexpr vec(const value_type& v) noexcept : base(v.shuffle(csizeseq_t<Nin * N>() % csize_t<Nin>())) {}
-
- template <int = 0>
- explicit constexpr vec(const vec<T, Nin * N>& v) noexcept : base(v)
- {
- }
-
- // from list of vectors
- template <typename... Us>
- constexpr vec(const value_type& s0, const value_type& s1, const Us&... rest) noexcept
- : base(s0, s1, rest...)
- {
- }
-
- template <typename U>
- constexpr vec(const vec<vec<U, Nin>, N>& v) noexcept : base(static_cast<vec<T, Nin * N>>(v.flatten()))
- {
- }
-
- template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)>
- constexpr static vec frombits(const vec<U, M>& v) noexcept
- {
- return vec(base::frombits(v.flatten()));
- }
-
- // math / bitwise / comparison operators
- constexpr friend vec operator+(const vec& x) noexcept { return x; }
- constexpr friend vec operator-(const vec& x) noexcept { return base::operator-(x); }
- constexpr friend vec operator~(const vec& x) noexcept { return base::operator~(x); }
-
-#define KFR_B(x) static_cast<const base&>(x)
-
- constexpr friend vec operator+(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) + KFR_B(y)); }
- constexpr friend vec operator-(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) - KFR_B(y)); }
- constexpr friend vec operator*(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) * KFR_B(y)); }
- constexpr friend vec operator/(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) / KFR_B(y)); }
-
- constexpr friend vec operator<<(const vec& x, int shift) noexcept { return vec(KFR_B(x) << shift); }
- constexpr friend vec operator>>(const vec& x, int shift) noexcept { return vec(KFR_B(x) >> shift); }
- constexpr friend vec operator&(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) & KFR_B(y)); }
- constexpr friend vec operator|(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) | KFR_B(y)); }
- constexpr friend vec operator^(const vec& x, const vec& y) noexcept { return vec(KFR_B(x) ^ KFR_B(y)); }
-
-#undef KFR_B
-
- constexpr friend vec& operator+=(vec& x, const vec& y) noexcept { return x = x + y; }
- constexpr friend vec& operator-=(vec& x, const vec& y) noexcept { return x = x - y; }
- constexpr friend vec& operator*=(vec& x, const vec& y) noexcept { return x = x * y; }
- constexpr friend vec& operator/=(vec& x, const vec& y) noexcept { return x = x / y; }
-
- constexpr friend vec& operator<<=(vec& x, int shift) noexcept { return x = x << shift; }
- constexpr friend vec& operator>>=(vec& x, int shift) noexcept { return x = x >> shift; }
- constexpr friend vec& operator&=(vec& x, const vec& y) noexcept { return x = x & y; }
- constexpr friend vec& operator|=(vec& x, const vec& y) noexcept { return x = x | y; }
- constexpr friend vec& operator^=(vec& x, const vec& y) noexcept { return x = x ^ y; }
-
- constexpr friend vec& operator++(vec& x) noexcept { return x = x + vec(1); }
- constexpr friend vec& operator--(vec& x) noexcept { return x = x - vec(1); }
- constexpr friend vec operator++(vec& x, int) noexcept
- {
- const vec z = x;
- ++x;
- return z;
- }
- constexpr friend vec operator--(vec& x, int) noexcept
- {
- const vec z = x;
- --x;
- return z;
- }
-
- // shuffle
- template <size_t... indices>
- constexpr vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...>) const noexcept
- {
- return *base::shuffle(scale<Nin, indices...>());
- }
- template <size_t... indices>
- constexpr vec<value_type, sizeof...(indices)> shuffle(const vec& y, csizes_t<indices...>) const noexcept
- {
- return *base::shuffle(y, scale<Nin, indices...>());
- }
-
- // element access
- struct element;
- CMT_GNU_CONSTEXPR value_type operator[](size_t index) const noexcept { return get(index); }
- CMT_GNU_CONSTEXPR element operator[](size_t index) noexcept { return { *this, index }; }
-
- CMT_GNU_CONSTEXPR value_type get(size_t index) const noexcept
- {
- return reinterpret_cast<const value_type(&)[N]>(*this)[index];
- }
- CMT_GNU_CONSTEXPR void set(size_t index, const value_type& s) noexcept
- {
- reinterpret_cast<value_type(&)[N]>(*this)[index] = s;
- }
- template <size_t index>
- CMT_GNU_CONSTEXPR value_type get(csize_t<index>) const noexcept
- {
- return static_cast<const base&>(*this).shuffle(csizeseq_t<Nin, index * Nin>());
- }
- template <size_t index>
- CMT_GNU_CONSTEXPR void set(csize_t<index>, const value_type& s) noexcept
- {
- *this = vec(static_cast<const base&>(*this))
- .shuffle(s, csizeseq_t<N>() + (csizeseq_t<N>() >= csize_t<index * Nin>() &&
- csizeseq_t<N>() < csize_t<(index + 1) * Nin>()) *
- N);
- }
- struct element
- {
- constexpr operator value_type() const noexcept { return v.get(index); }
- element& operator=(const value_type& s) noexcept
- {
- v.set(index, s);
- return *this;
- }
- vec& v;
- size_t index;
- };
-
- template <bool aligned = false>
- explicit constexpr vec(const value_type* src, cbool_t<aligned> = cbool_t<aligned>()) noexcept
- : base(ptr_cast<T>(src), cbool_t<aligned>())
- {
- }
- template <bool aligned = false>
- const vec& write(value_type* dest, cbool_t<aligned> = cbool_t<aligned>()) const noexcept
- {
- base::write(ptr_cast<T>(dest), cbool_t<aligned>());
- return *this;
- }
-
- const base& flatten() const noexcept { return *this; }
- simd_type operator*() const noexcept { return base::operator*(); }
- simd_type& operator*() noexcept { return base::operator*(); }
-};
-
-namespace internal
-{
-
-template <typename T>
-constexpr inline T maskbits(bool value)
-{
- return value ? constants<T>::allones() : T();
-}
-
-template <typename T, size_t N>
-struct flt_type_impl<vec<T, N>>
-{
- using type = vec<typename flt_type_impl<T>::type, N>;
-};
-
-template <typename T>
-struct is_vec_impl : std::false_type
-{
-};
-
-template <typename T, size_t N>
-struct is_vec_impl<vec<T, N>> : std::true_type
-{
-};
-} // namespace internal
-
-template <typename T>
-using is_vec = internal::is_vec_impl<T>;
-
-template <typename To, typename From, size_t N,
- KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value),
- size_t Nout = N* compound_type_traits<From>::width / compound_type_traits<To>::width>
-constexpr CMT_INLINE vec<To, Nout> compcast(const vec<From, N>& value) noexcept
-{
- return vec<To, Nout>(value.flatten());
-}
-
-#ifdef KFR_ENABLE_SWIZZLE
-namespace swizzle
-{
-template <size_t>
-struct swiz
-{
- constexpr swiz() {}
-};
-
-constexpr swiz<0> x{};
-constexpr swiz<1> y{};
-constexpr swiz<2> z{};
-constexpr swiz<3> w{};
-constexpr swiz<0> r{};
-constexpr swiz<1> g{};
-constexpr swiz<2> b{};
-constexpr swiz<3> a{};
-constexpr swiz<0> s{};
-constexpr swiz<1> t{};
-constexpr swiz<2> p{};
-constexpr swiz<3> q{};
-
-constexpr swiz<0> s0{};
-constexpr swiz<1> s1{};
-constexpr swiz<2> s2{};
-constexpr swiz<3> s3{};
-constexpr swiz<4> s4{};
-constexpr swiz<5> s5{};
-constexpr swiz<6> s6{};
-constexpr swiz<7> s7{};
-constexpr swiz<8> s8{};
-constexpr swiz<9> s9{};
-constexpr swiz<10> s10{};
-constexpr swiz<11> s11{};
-constexpr swiz<12> s12{};
-constexpr swiz<13> s13{};
-constexpr swiz<14> s14{};
-constexpr swiz<15> s15{};
-} // namespace swizzle
-#endif
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast")
-
-template <size_t N, typename T>
-constexpr CMT_INLINE vec<T, N> broadcast(T x)
-{
- return x;
-}
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
-
-namespace internal
-{
-
-template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To>,
- size_t Nout = N* compound_type_traits<To>::deep_width>
-constexpr CMT_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept
-{
- return vec<To, N>(value);
-}
-
-// scalar to scalar
-template <typename To, typename From>
-struct conversion
-{
- static_assert(std::is_convertible<From, To>::value, "");
- static To cast(const From& value) { return value; }
-};
-
-// vector to vector
-template <typename To, typename From, size_t N>
-struct conversion<vec<To, N>, vec<From, N>>
-{
- static_assert(!is_compound<To>::value, "");
- static_assert(!is_compound<From>::value, "");
- static vec<To, N> cast(const vec<From, N>& value) { return builtin_convertvector<To>(value); }
-};
-
-// vector<vector> to vector<vector>
-template <typename To, typename From, size_t N1, size_t N2>
-struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, N1>, N2>>
-{
- static_assert(!is_compound<To>::value, "");
- static_assert(!is_compound<From>::value, "");
- static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value)
- {
- return builtin_convertvector<vec<To, N1>>(value);
- }
-};
-
-// scalar to vector
-template <typename To, typename From, size_t N>
-struct conversion<vec<To, N>, From>
-{
- static_assert(std::is_convertible<From, To>::value, "");
- static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); }
-};
-} // namespace internal
-
-template <typename T>
-constexpr size_t size_of() noexcept
-{
- return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width;
-}
-
-template <typename From, size_t N, typename Tsub = deep_subtype<From>,
- size_t Nout = N* size_of<From>() / size_of<Tsub>()>
-constexpr CMT_INLINE vec<Tsub, Nout> flatten(const vec<From, N>& x) noexcept
-{
- return x.flatten();
-}
-
-template <typename To, typename From,
- typename Tout = typename compound_type_traits<From>::template deep_rebind<To>>
-constexpr CMT_INLINE Tout cast(const From& value) noexcept
-{
- return static_cast<Tout>(value);
-}
-
-template <typename To, typename From>
-CMT_GNU_CONSTEXPR CMT_INLINE To bitcast(const From& value) noexcept
-{
- static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types");
- union {
- From from;
- To to;
- } u{ value };
- return u.to;
-}
-
-template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
-{
- return vec<To, Nout>::frombits(value);
-}
-
-template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr CMT_INLINE To ubitcast(const From& value) noexcept
-{
- return bitcast<To>(value);
-}
-
-template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr CMT_INLINE To ibitcast(const From& value) noexcept
-{
- return bitcast<To>(value);
-}
-
-template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
-constexpr CMT_INLINE To fbitcast(const From& value) noexcept
-{
- return bitcast<To>(value);
-}
-
-template <typename From, size_t N, typename To = utype<From>,
- size_t Nout = size_of<From>() * N / size_of<To>()>
-constexpr CMT_INLINE vec<To, Nout> ubitcast(const vec<From, N>& value) noexcept
-{
- return vec<To, Nout>::frombits(value);
-}
-
-template <typename From, size_t N, typename To = itype<From>,
- size_t Nout = size_of<From>() * N / size_of<To>()>
-constexpr CMT_INLINE vec<To, Nout> ibitcast(const vec<From, N>& value) noexcept
-{
- return vec<To, Nout>::frombits(value);
-}
-
-template <typename From, size_t N, typename To = ftype<From>,
- size_t Nout = size_of<From>() * N / size_of<To>()>
-constexpr CMT_INLINE vec<To, Nout> fbitcast(const vec<From, N>& value) noexcept
-{
- return vec<To, Nout>::frombits(value);
-}
-
-template <typename T, size_t N>
-inline bool mask<T, N>::operator[](size_t index) const noexcept
-{
- return ibitcast(base::operator[](index)) < 0;
-}
-
-constexpr CMT_INLINE size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
-
-namespace internal
-{
-template <size_t start = 0, size_t stride = 1>
-struct shuffle_index
-{
- constexpr CMT_INLINE size_t operator()(size_t index) const { return start + index * stride; }
-};
-
-template <size_t count, size_t start = 0, size_t stride = 1>
-struct shuffle_index_wrap
-{
- constexpr inline size_t operator()(size_t index) const { return (start + index * stride) % count; }
-};
-} // namespace internal
-
-template <size_t count, typename T, size_t N, size_t Nout = N* count>
-CMT_INLINE vec<T, Nout> repeat(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<Nout>() % csize_t<N>());
-}
-KFR_FN(repeat)
-
-template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)>
-CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<Nout>() % csize_t<N>());
-}
-template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)>
-constexpr CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x)
-{
- return x;
-}
-KFR_FN(resize)
-
-template <typename T, size_t N>
-struct pkd_vec
-{
- constexpr pkd_vec() noexcept {}
- pkd_vec(const vec<T, N>& value) noexcept { value.write(v); }
- template <typename... Ts>
- constexpr pkd_vec(Ts... init) noexcept : v{ static_cast<T>(init)... }
- {
- static_assert(N <= sizeof...(Ts), "Too few initializers for pkd_vec");
- }
-
-private:
- T v[N];
- friend struct vec<T, N>;
-}
-#ifdef CMT_GNU_ATTRIBUTES
-__attribute__((packed))
-#endif
-;
-
-namespace internal
-{
-
-template <size_t, typename T>
-constexpr CMT_INLINE T make_vector_get_n()
-{
- return T();
-}
-template <size_t index, typename T, typename... Args>
-constexpr CMT_INLINE T make_vector_get_n(const T& arg, const Args&... args)
-{
- return index == 0 ? arg : make_vector_get_n<index - 1, T>(args...);
-}
-
-template <typename T, typename... Args, size_t... indices, size_t N = sizeof...(Args)>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> make_vector_impl(csizes_t<indices...>, const Args&... args)
-{
- const T list[] = { static_cast<T>(args)... };
- return vec<T, N>(list[indices]...);
-}
-} // namespace internal
-
-/// Create vector from scalar values
-/// @code
-/// CHECK( make_vector( 1, 2, 3, 4 ) == i32x4{1, 2, 3, 4} );
-/// @endcode
-template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1),
- typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>
-constexpr CMT_INLINE vec<SubType, N> make_vector(const Arg& x, const Args&... rest)
-{
- return internal::make_vector_impl<SubType>(cvalseq_t<size_t, N>(), static_cast<SubType>(x),
- static_cast<SubType>(rest)...);
-}
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> make_vector(const vec<T, N>& x)
-{
- return x;
-}
-template <typename T, T... Values, size_t N = sizeof...(Values)>
-constexpr CMT_INLINE vec<T, N> make_vector(cvals_t<T, Values...>)
-{
- return make_vector<T>(Values...);
-}
-KFR_FN(make_vector)
-
-template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1),
- typename SubType = conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>,
- KFR_ENABLE_IF(is_number<subtype<SubType>>::value)>
-constexpr CMT_INLINE vec<SubType, N> pack(const Arg& x, const Args&... rest)
-{
- return internal::make_vector_impl<SubType>(csizeseq_t<N * widthof<SubType>()>(), static_cast<SubType>(x),
- static_cast<SubType>(rest)...);
-}
-KFR_FN(pack)
-
-namespace operators
-{
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator+(const vec<T1, N>& x, const T2& y)
-{
- return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator-(const vec<T1, N>& x, const T2& y)
-{
- return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator*(const vec<T1, N>& x, const T2& y)
-{
- return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator/(const vec<T1, N>& x, const T2& y)
-{
- return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y);
-}
-
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator+(const T1& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator-(const T1& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator*(const T1& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator/(const T1& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y);
-}
-
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator+(const vec<T1, N>& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) + static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator-(const vec<T1, N>& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) - static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator*(const vec<T1, N>& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) * static_cast<vec<C, N>>(y);
-}
-template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>>
-constexpr CMT_INLINE vec<C, N> operator/(const vec<T1, N>& x, const vec<T2, N>& y)
-{
- return static_cast<vec<C, N>>(x) / static_cast<vec<C, N>>(y);
-}
-
-template <typename T1, size_t N>
-constexpr CMT_INLINE vec<T1, N> operator&&(const T1& x, const vec<T1, N>& y)
-{
- return static_cast<vec<T1, N>>(x) && y;
-}
-template <typename T1, size_t N>
-constexpr CMT_INLINE vec<T1, N> operator||(const T1& x, const vec<T1, N>& y)
-{
- return static_cast<vec<T1, N>>(x) || y;
-}
-template <typename T1, size_t N>
-constexpr CMT_INLINE vec<T1, N> operator&(const T1& x, const vec<T1, N>& y)
-{
- return static_cast<vec<T1, N>>(x) & y;
-}
-template <typename T1, size_t N>
-constexpr CMT_INLINE vec<T1, N> operator|(const T1& x, const vec<T1, N>& y)
-{
- return static_cast<vec<T1, N>>(x) | y;
-}
-template <typename T1, size_t N>
-constexpr CMT_INLINE vec<T1, N> operator^(const T1& x, const vec<T1, N>& y)
-{
- return static_cast<vec<T1, N>>(x) ^ y;
-}
-} // namespace operators
-
-using namespace operators;
-
-template <typename T, size_t N1, size_t N2 = N1>
-using mat = vec<vec<T, N1>, N2>;
-
-namespace internal
-{
-
-template <size_t start, size_t count>
-struct shuffle_index_extend
-{
- constexpr CMT_INLINE size_t operator()(size_t index) const
- {
- return index >= start && index < start + count ? index - start : index_undefined;
- }
-};
-
-template <typename T, size_t Nout, size_t N1, size_t... indices>
-constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>)
-{
- return make_vector(maskbits<T>(indices < N1)...);
-}
-template <typename T, size_t Nout, size_t N1>
-constexpr vec<T, Nout> partial_mask()
-{
- return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>());
-}
-} // namespace internal
-
-template <typename T>
-using optvec = vec<T, platform<T>::vector_capacity / 4>;
-
-using f32x1 = vec<f32, 1>;
-using f32x2 = vec<f32, 2>;
-using f32x3 = vec<f32, 3>;
-using f32x4 = vec<f32, 4>;
-using f32x8 = vec<f32, 8>;
-using f32x16 = vec<f32, 16>;
-using f32x32 = vec<f32, 32>;
-using f32x64 = vec<f32, 64>;
-using f64x1 = vec<f64, 1>;
-using f64x2 = vec<f64, 2>;
-using f64x3 = vec<f64, 3>;
-using f64x4 = vec<f64, 4>;
-using f64x8 = vec<f64, 8>;
-using f64x16 = vec<f64, 16>;
-using f64x32 = vec<f64, 32>;
-using f64x64 = vec<f64, 64>;
-using i8x1 = vec<i8, 1>;
-using i8x2 = vec<i8, 2>;
-using i8x3 = vec<i8, 3>;
-using i8x4 = vec<i8, 4>;
-using i8x8 = vec<i8, 8>;
-using i8x16 = vec<i8, 16>;
-using i8x32 = vec<i8, 32>;
-using i8x64 = vec<i8, 64>;
-using i16x1 = vec<i16, 1>;
-using i16x2 = vec<i16, 2>;
-using i16x3 = vec<i16, 3>;
-using i16x4 = vec<i16, 4>;
-using i16x8 = vec<i16, 8>;
-using i16x16 = vec<i16, 16>;
-using i16x32 = vec<i16, 32>;
-using i16x64 = vec<i16, 64>;
-using i32x1 = vec<i32, 1>;
-using i32x2 = vec<i32, 2>;
-using i32x3 = vec<i32, 3>;
-using i32x4 = vec<i32, 4>;
-using i32x8 = vec<i32, 8>;
-using i32x16 = vec<i32, 16>;
-using i32x32 = vec<i32, 32>;
-using i32x64 = vec<i32, 64>;
-using i64x1 = vec<i64, 1>;
-using i64x2 = vec<i64, 2>;
-using i64x3 = vec<i64, 3>;
-using i64x4 = vec<i64, 4>;
-using i64x8 = vec<i64, 8>;
-using i64x16 = vec<i64, 16>;
-using i64x32 = vec<i64, 32>;
-using i64x64 = vec<i64, 64>;
-using u8x1 = vec<u8, 1>;
-using u8x2 = vec<u8, 2>;
-using u8x3 = vec<u8, 3>;
-using u8x4 = vec<u8, 4>;
-using u8x8 = vec<u8, 8>;
-using u8x16 = vec<u8, 16>;
-using u8x32 = vec<u8, 32>;
-using u8x64 = vec<u8, 64>;
-using u16x1 = vec<u16, 1>;
-using u16x2 = vec<u16, 2>;
-using u16x3 = vec<u16, 3>;
-using u16x4 = vec<u16, 4>;
-using u16x8 = vec<u16, 8>;
-using u16x16 = vec<u16, 16>;
-using u16x32 = vec<u16, 32>;
-using u16x64 = vec<u16, 64>;
-using u32x1 = vec<u32, 1>;
-using u32x2 = vec<u32, 2>;
-using u32x3 = vec<u32, 3>;
-using u32x4 = vec<u32, 4>;
-using u32x8 = vec<u32, 8>;
-using u32x16 = vec<u32, 16>;
-using u32x32 = vec<u32, 32>;
-using u32x64 = vec<u32, 64>;
-using u64x1 = vec<u64, 1>;
-using u64x2 = vec<u64, 2>;
-using u64x3 = vec<u64, 3>;
-using u64x4 = vec<u64, 4>;
-using u64x8 = vec<u64, 8>;
-using u64x16 = vec<u64, 16>;
-using u64x32 = vec<u64, 32>;
-using u64x64 = vec<u64, 64>;
-
-using u8x2x2 = vec<vec<u8, 2>, 2>;
-using i8x2x2 = vec<vec<i8, 2>, 2>;
-using u16x2x2 = vec<vec<u16, 2>, 2>;
-using i16x2x2 = vec<vec<i16, 2>, 2>;
-using u32x2x2 = vec<vec<u32, 2>, 2>;
-using i32x2x2 = vec<vec<i32, 2>, 2>;
-using u64x2x2 = vec<vec<u64, 2>, 2>;
-using i64x2x2 = vec<vec<i64, 2>, 2>;
-using f32x2x2 = vec<vec<f32, 2>, 2>;
-using f64x2x2 = vec<vec<f64, 2>, 2>;
-
-using u8x4x4 = vec<vec<u8, 4>, 4>;
-using i8x4x4 = vec<vec<i8, 4>, 4>;
-using u16x4x4 = vec<vec<u16, 4>, 4>;
-using i16x4x4 = vec<vec<i16, 4>, 4>;
-using u32x4x4 = vec<vec<u32, 4>, 4>;
-using i32x4x4 = vec<vec<i32, 4>, 4>;
-using u64x4x4 = vec<vec<u64, 4>, 4>;
-using i64x4x4 = vec<vec<i64, 4>, 4>;
-using f32x4x4 = vec<vec<f32, 4>, 4>;
-using f64x4x4 = vec<vec<f64, 4>, 4>;
-
-namespace glsl_names
-{
-using vec2 = f32x2;
-using vec3 = f32x3;
-using vec4 = f32x4;
-using dvec2 = f64x2;
-using dvec3 = f64x3;
-using dvec4 = f64x4;
-using ivec2 = i32x2;
-using ivec3 = i32x3;
-using ivec4 = i32x4;
-using uvec2 = u32x2;
-using uvec3 = u32x3;
-using uvec4 = u32x4;
-} // namespace glsl_names
-namespace opencl_names
-{
-using char2 = i8x2;
-using char3 = i8x3;
-using char4 = i8x4;
-using char8 = i8x8;
-using char16 = i8x16;
-using uchar2 = u8x2;
-using uchar3 = u8x3;
-using uchar4 = u8x4;
-using uchar8 = u8x8;
-using uchar16 = u8x16;
-
-using short2 = i16x2;
-using short3 = i16x3;
-using short4 = i16x4;
-using short8 = i16x8;
-using short16 = i16x16;
-using ushort2 = u16x2;
-using ushort3 = u16x3;
-using ushort4 = u16x4;
-using ushort8 = u16x8;
-using ushort16 = u16x16;
-
-using int2 = i32x2;
-using int3 = i32x3;
-using int4 = i32x4;
-using int8 = i32x8;
-using int16 = i32x16;
-using uint2 = u32x2;
-using uint3 = u32x3;
-using uint4 = u32x4;
-using uint8 = u32x8;
-using uint16 = u32x16;
-
-using long2 = i64x2;
-using long3 = i64x3;
-using long4 = i64x4;
-using long8 = i64x8;
-using long16 = i64x16;
-using ulong2 = u64x2;
-using ulong3 = u64x3;
-using ulong4 = u64x4;
-using ulong8 = u64x8;
-using ulong16 = u64x16;
-
-using float2 = f32x2;
-using float3 = f32x3;
-using float4 = f32x4;
-using float8 = f32x8;
-using float16 = f32x16;
-
-using double2 = f64x2;
-using double3 = f64x3;
-using double4 = f64x4;
-using double8 = f64x8;
-using double16 = f64x16;
-} // namespace opencl_names
-
-namespace internal
-{
-
-template <typename T, size_t N>
-struct vec_type
-{
- using type = vec<T, N>;
-};
-
-template <typename T, size_t Nmax>
-struct maxvec
-{
- constexpr static size_t size = Nmax;
- vec<T, size> vmax;
- maxvec(T initial) : vmax(initial) {}
- template <int N>
- vec<T, N>& v()
- {
- static_assert(N <= size, "N <= size");
- return reinterpret_cast<vec<T, N>&>(*this);
- }
- template <int N>
- const vec<T, N>& v() const
- {
- static_assert(N <= size, "N <= size");
- return reinterpret_cast<const vec<T, N>&>(*this);
- }
-};
-
-template <size_t Index, typename T, size_t N, typename Fn, typename... Args,
- typename Tout = result_of<Fn(subtype<decay<Args>>...)>>
-constexpr CMT_INLINE Tout applyfn_helper(Fn&& fn, Args&&... args)
-{
- return fn(args[Index]...);
-}
-
-template <typename T, size_t N, typename Fn, typename... Args,
- typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices>
-constexpr CMT_INLINE vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args)
-{
- return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...);
-}
-template <typename T, size_t N, typename Fn, size_t... Indices>
-constexpr CMT_INLINE vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>)
-{
- return make_vector(((void)Indices, void(), fn())...);
-}
-} // namespace internal
-
-template <typename T, size_t N, typename Fn, typename... Args,
- typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>>
-constexpr CMT_INLINE vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args)
-{
- return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>(), arg,
- std::forward<Args>(args)...);
-}
-
-template <size_t N, typename Fn, typename T = result_of<Fn()>>
-constexpr CMT_INLINE vec<T, N> apply(Fn&& fn)
-{
- return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq_t<N>());
-}
-
-#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_SIMD
-CMT_INLINE f32x4 tovec(__m128 x) { return f32x4(x); }
-CMT_INLINE f64x2 tovec(__m128d x) { return f64x2(x); }
-#endif
-
-template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)>
-constexpr CMT_INLINE mask<T, Nout> make_mask(bool arg, Args... args)
-{
- return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...);
-}
-KFR_FN(make_mask)
-
-template <typename T, size_t N>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector()
-{
- return vec<T, N>(czeros);
-}
-
-template <typename T, size_t N>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> zerovector(vec_t<T, N>)
-{
- return vec<T, N>(czeros);
-}
-KFR_FN(zerovector)
-
-template <typename T, size_t N>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector()
-{
- return vec<T, N>(cones);
-}
-template <typename T, size_t N>
-CMT_GNU_CONSTEXPR CMT_INLINE vec<T, N> allonesvector(vec_t<T, N>)
-{
- return vec<T, N>(cones);
-}
-KFR_FN(allonesvector)
-
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> undefinedvector()
-{
- return vec<T, N>{};
-}
-template <typename T, size_t N>
-constexpr CMT_INLINE vec<T, N> undefinedvector(vec_t<T, N>)
-{
- return undefinedvector<T, N>();
-}
-KFR_FN(undefinedvector)
-
-template <typename T, size_t N, size_t Nout /*= prev_poweroftwo(N - 1)*/>
-CMT_INLINE vec<T, Nout> low(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<Nout>());
-}
-
-template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)>
-CMT_INLINE vec_t<T, Nout> low(vec_t<T, N>)
-{
- return {};
-}
-
-template <typename T, size_t N, size_t Nout /*= N - prev_poweroftwo(N - 1)*/>
-CMT_INLINE vec<T, Nout> high(const vec<T, N>& x)
-{
- return x.shuffle(csizeseq_t<Nout, prev_poweroftwo(N - 1)>());
-}
-
-template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)>
-CMT_INLINE vec_t<T, Nout> high(vec_t<T, N>)
-{
- return {};
-}
-KFR_FN(low)
-KFR_FN(high)
-} // namespace kfr
-
-namespace cometa
-{
-
-template <typename T, size_t N>
-struct compound_type_traits<kfr::vec_t<T, N>>
-{
- constexpr static size_t width = N;
- constexpr static size_t deep_width = width * compound_type_traits<T>::width;
- using subtype = T;
- using deep_subtype = cometa::deep_subtype<T>;
- constexpr static bool is_scalar = false;
- constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
-
- template <typename U>
- using rebind = kfr::vec_t<U, N>;
- template <typename U>
- using deep_rebind = kfr::vec_t<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
-};
-
-template <typename T, size_t N>
-struct compound_type_traits<kfr::vec<T, N>>
-{
- using subtype = T;
- using deep_subtype = cometa::deep_subtype<T>;
- constexpr static size_t width = N;
- constexpr static size_t deep_width = width * compound_type_traits<T>::width;
- constexpr static bool is_scalar = false;
- constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
- template <typename U>
- using rebind = kfr::vec<U, N>;
- template <typename U>
- using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
-
- CMT_INLINE static constexpr subtype at(const kfr::vec<T, N>& value, size_t index) { return value[index]; }
-};
-
-template <typename T, size_t N>
-struct compound_type_traits<kfr::mask<T, N>>
-{
- using subtype = T;
- using deep_subtype = cometa::deep_subtype<T>;
- constexpr static size_t width = N;
- constexpr static size_t deep_width = width * compound_type_traits<T>::width;
- constexpr static bool is_scalar = false;
- constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
- template <typename U>
- using rebind = kfr::mask<U, N>;
- template <typename U>
- using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
-
- CMT_INLINE static constexpr subtype at(const kfr::mask<T, N>& value, size_t index)
- {
- return value[index];
- }
-};
-} // namespace cometa
-
-namespace std
-{
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::vec<T1, N>, kfr::vec<T2, N>>
-{
- using type = kfr::vec<typename common_type<T1, T2>::type, N>;
-};
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::vec<T1, N>, T2>
-{
- using type = kfr::vec<typename common_type<T1, T2>::type, N>;
-};
-template <typename T1, typename T2, size_t N>
-struct common_type<T1, kfr::vec<T2, N>>
-{
- using type = kfr::vec<typename common_type<T1, T2>::type, N>;
-};
-template <typename T1, typename T2, size_t N1, size_t N2>
-struct common_type<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>>
-{
- using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>;
-};
-template <typename T1, typename T2, size_t N1, size_t N2>
-struct common_type<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>>
-{
- using type = kfr::vec<kfr::vec<typename common_type<T1, T2>::type, N1>, N2>;
-};
-
-template <typename T1, typename T2, size_t N>
-struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
-{
- using type = kfr::mask<typename common_type<T1, T2>::type, N>;
-};
-} // namespace std
-
-CMT_PRAGMA_GNU(GCC diagnostic pop)
-CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -16,8 +16,10 @@ extern char* gets(char* __s);
#ifdef CMT_ARCH_X86
#if defined(_M_X64) || defined(__x86_64__)
#define CMT_ARCH_X64 1
+#define CMT_ARCH_BITNESS_NAME "64-bit"
#else
#define CMT_ARCH_X32 1
+#define CMT_ARCH_BITNESS_NAME "32-bit"
#endif
#ifndef CMT_FORCE_GENERIC_CPU
@@ -133,8 +135,10 @@ extern char* gets(char* __s);
#if defined(__aarch64__)
#define CMT_ARCH_X64 1
+#define CMT_ARCH_BITNESS_NAME "64-bit"
#else
#define CMT_ARCH_X32 1
+#define CMT_ARCH_BITNESS_NAME "32-bit"
#endif
#ifdef __ARM_NEON__
@@ -146,22 +150,22 @@ extern char* gets(char* __s);
#else
#define CMT_ARCH_NEON 1
#define CMT_ARCH_NAME neon
-#define KFR_NO_NATIVE_F64 1
+#define CMT_NO_NATIVE_F64 1
#endif
#endif
#endif
#ifndef CMT_ARCH_NAME
-#define CMT_ARCH_NAME common
+#define CMT_ARCH_NAME generic
#endif
-#ifndef KFR_NO_NATIVE_F64
-#define KFR_NATIVE_F64 1
+#ifndef CMT_NO_NATIVE_F64
+#define CMT_NATIVE_F64 1
#endif
-#ifndef KFR_NO_NATIVE_I64
-#define KFR_NATIVE_I64 1
+#ifndef CMT_NO_NATIVE_I64
+#define CMT_NATIVE_I64 1
#endif
#define CMT_STRINGIFY2(x) #x
@@ -250,28 +254,29 @@ extern char* gets(char* __s);
#define CMT_ALWAYS_INLINE
#endif
#define CMT_INLINE __inline__ CMT_ALWAYS_INLINE
-#define CMT_INTRIN CMT_INLINE CMT_NODEBUG
#define CMT_INLINE_MEMBER CMT_ALWAYS_INLINE
#define CMT_INLINE_LAMBDA CMT_INLINE_MEMBER
#define CMT_NOINLINE __attribute__((__noinline__))
#define CMT_FLATTEN __attribute__((__flatten__))
#define CMT_RESTRICT __restrict__
-#define CMT_FUNC __inline__
#elif defined(CMT_MSVC_ATTRIBUTES)
+#define CMT_ALWAYS_INLINE __forceinline
#define CMT_NODEBUG
#define CMT_INLINE /*inline*/ __forceinline
-#define CMT_INTRIN CMT_INLINE CMT_NODEBUG
#define CMT_INLINE_MEMBER __forceinline
#define CMT_INLINE_LAMBDA
#define CMT_NOINLINE __declspec(noinline)
#define CMT_FLATTEN
#define CMT_RESTRICT __restrict
-#define CMT_FUNC inline
#endif
+#define CMT_INTRINSIC CMT_INLINE CMT_NODEBUG
+#define CMT_MEM_INTRINSIC CMT_INLINE CMT_NODEBUG
+#define CMT_FUNCTION inline
+
#if defined _MSC_VER && _MSC_VER >= 1900 && \
(!defined(__clang__) || \
(defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 9))))
@@ -386,8 +391,10 @@ extern char* gets(char* __s);
#if CMT_HAS_NOEXCEPT
#define CMT_NOEXCEPT noexcept
+#define CMT_NOEXCEPT_SPEC(...) noexcept(__VA_ARGS__)
#else
#define CMT_NOEXCEPT
+#define CMT_NOEXCEPT_SPEC(...)
#endif
#if CMT_COMPILER_GNU && !defined(__EXCEPTIONS)
@@ -491,16 +498,55 @@ extern char* gets(char* __s);
#define CMT_OS_NAME "unknown"
#endif
-#if defined CMT_COMPILER_CLANG
+#if defined CMT_COMPILER_INTEL
#if defined _MSC_VER
-#define CMT_COMPIER_NAME "clang-msvc"
+#define CMT_COMPILER_NAME "intel-msvc"
+#define CMT_COMPILER_FULL_NAME \
+ "clang-msvc-" CMT_STRINGIFY(__ICL) "." CMT_STRINGIFY(__INTEL_COMPILER_UPDATE) "." CMT_STRINGIFY( \
+ __INTEL_COMPILER_BUILD_DATE)
+#else
+#define CMT_COMPILER_NAME "intel"
+#ifdef __INTEL_CLANG_COMPILER
+#define CMT_COMPILER_INTEL_SPEC "-clang"
+#ifdef __INTEL_LLVM_COMPILER
+#define CMT_COMPILER_INTEL_SPEC "-clang-llvm"
+#endif
#else
-#define CMT_COMPIER_NAME "clang"
+#ifdef __INTEL_LLVM_COMPILER
+#define CMT_COMPILER_INTEL_SPEC "-llvm"
+#else
+#define CMT_COMPILER_INTEL_SPEC ""
+#endif
+#endif
+#define CMT_COMPILER_FULL_NAME \
+ "intel-" CMT_STRINGIFY(__INTEL_COMPILER) CMT_COMPILER_INTEL_SPEC \
+ "." CMT_STRINGIFY(__INTEL_COMPILER_UPDATE) "." CMT_STRINGIFY(__INTEL_COMPILER_BUILD_DATE)
+#endif
+#elif defined CMT_COMPILER_CLANG
+#if defined _MSC_VER
+#define CMT_COMPILER_NAME "clang-msvc"
+#define CMT_COMPILER_FULL_NAME \
+ "clang-msvc-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \
+ __clang_patchlevel__)
+#else
+#define CMT_COMPILER_NAME "clang"
+#define CMT_COMPILER_FULL_NAME \
+ "clang-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \
+ __clang_patchlevel__)
#endif
#elif defined CMT_COMPILER_GCC
-#define CMT_COMPIER_NAME "gcc"
+#define CMT_COMPILER_NAME "gcc"
+#define CMT_COMPILER_FULL_NAME \
+ "gcc-" CMT_STRINGIFY(__GNUC__) "." CMT_STRINGIFY(__GNUC_MINOR__) "." CMT_STRINGIFY(__GNUC_PATCHLEVEL__)
#elif defined CMT_COMPILER_MSVC
-#define CMT_COMPIER_NAME "msvc"
+#define CMT_COMPILER_NAME "msvc"
+#define CMT_COMPILER_FULL_NAME "msvc-" CMT_STRINGIFY(_MSC_VER) "." CMT_STRINGIFY(_MSC_FULL_VER)
#else
-#define CMT_COMPIER_NAME "unknown"
+#define CMT_COMPILER_NAME "unknown"
+#define CMT_COMPILER_FULL_NAME "unknown"
#endif
+
+#define CMT_CONCAT(a, b) a##b
+
+#define CMT_NARGS2(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, ...) _10
+#define CMT_NARGS(...) CMT_NARGS2(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp
@@ -8,11 +8,15 @@
#include <cstdint>
#include <cstdlib>
#include <limits>
+#include <random>
#include <type_traits>
#include <utility>
CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunknown-warning-option")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized")
CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4814))
@@ -26,13 +30,13 @@ using std::size_t;
#if __cplusplus >= 201103L || CMT_MSC_VER >= 1900 || CMT_HAS_FEATURE(cxx_constexpr)
template <typename T, size_t N>
-constexpr inline static size_t arraysize(const T (&)[N]) noexcept
+constexpr inline static size_t arraysize(const T (&)[N]) CMT_NOEXCEPT
{
return N;
}
template <typename T, size_t N>
-constexpr inline static std::integral_constant<size_t, N> carraysize(const T (&)[N]) noexcept
+constexpr inline static std::integral_constant<size_t, N> carraysize(const T (&)[N]) CMT_NOEXCEPT
{
return {};
}
@@ -173,9 +177,6 @@ using is_template_arg = std::integral_constant<bool, std::is_integral<T>::value
template <typename T>
using decay = typename std::decay<T>::type;
-template <typename... T>
-using decay_common = decay<common_type<T...>>;
-
template <typename T1, typename T2 = void, typename... Ts>
constexpr size_t typeindex()
{
@@ -253,7 +254,7 @@ namespace ops
{
struct empty
{
- constexpr empty() noexcept {}
+ constexpr empty() CMT_NOEXCEPT {}
};
} // namespace ops
@@ -261,9 +262,9 @@ template <typename T, T val>
struct cval_t : ops::empty
{
constexpr static T value = val;
- constexpr cval_t() noexcept {}
- constexpr cval_t(const cval_t&) noexcept = default;
- constexpr cval_t(cval_t&&) noexcept = default;
+ constexpr cval_t() CMT_NOEXCEPT {}
+ constexpr cval_t(const cval_t&) CMT_NOEXCEPT = default;
+ constexpr cval_t(cval_t&&) CMT_NOEXCEPT = default;
typedef T value_type;
typedef cval_t type;
constexpr operator value_type() const { return value; }
@@ -386,6 +387,8 @@ struct get_nth_type<index>
template <typename T, T... values>
struct cvals_t : ops::empty
{
+ constexpr cvals_t() CMT_NOEXCEPT = default;
+
using type = cvals_t<T, values...>;
constexpr static size_t size() { return sizeof...(values); }
template <size_t index>
@@ -413,12 +416,13 @@ struct cvals_t : ops::empty
constexpr cvals_t<T, details::get_nth_e<indices, type>::value...> operator[](
cvals_t<size_t, indices...>) const
{
+ // static_assert(sizeof(T)==0, "+++++++++++++++++++++++++++++");
return {};
}
// MSVC requires static_cast<T> here:
template <typename Fn>
- constexpr auto map(Fn&& fn) -> cvals_t<T, static_cast<T>(Fn()(values))...>
+ constexpr auto map(Fn&&) const -> cvals_t<T, static_cast<T>(Fn()(values))...>
{
return {};
}
@@ -487,6 +491,10 @@ constexpr inline T cprod(cvals_t<T, first, rest...>)
template <typename T>
struct ctype_t
{
+#ifdef CMT_COMPILER_INTEL
+ constexpr ctype_t() CMT_NOEXCEPT = default;
+ constexpr ctype_t(const ctype_t&) CMT_NOEXCEPT = default;
+#endif
using type = T;
};
@@ -510,9 +518,15 @@ struct ctypes_t
namespace details
{
-template <typename T1, typename T2>
+template <typename T1, typename... Ts>
struct concat_impl;
+template <typename T>
+struct concat_impl<T>
+{
+ using type = T;
+};
+
template <typename T, T... values1, T... values2>
struct concat_impl<cvals_t<T, values1...>, cvals_t<T, values2...>>
{
@@ -523,12 +537,19 @@ struct concat_impl<ctypes_t<types1...>, ctypes_t<types2...>>
{
using type = ctypes_t<types1..., types2...>;
};
+
+template <typename T1, typename T2, typename T3, typename... Ts>
+struct concat_impl<T1, T2, T3, Ts...>
+{
+ using type = typename concat_impl<typename concat_impl<T1, T2>::type, T3, Ts...>::type;
+};
+
} // namespace details
-template <typename T1, typename T2>
-using concat_lists = typename details::concat_impl<T1, T2>::type;
+template <typename T1, typename... Ts>
+using concat_lists = typename details::concat_impl<decay<T1>, decay<Ts>...>::type;
-template <typename T1, typename T2>
-constexpr inline concat_lists<T1, T2> cconcat(T1, T2)
+template <typename T1, typename... Ts>
+constexpr inline concat_lists<T1, Ts...> cconcat(T1, Ts...)
{
return {};
}
@@ -584,7 +605,7 @@ template <typename Fn>
using function_result = typename details::function_arguments_impl<decltype(&Fn::operator())>::result;
template <typename T1, typename T2>
-using cfilter_t = typename details::filter_impl<T1, T2>::type;
+using cfilter_t = typename details::filter_impl<decay<T1>, decay<T2>>::type;
template <typename T, T... vals, bool... flags,
typename Ret = cfilter_t<cvals_t<T, vals...>, cvals_t<bool, flags...>>>
@@ -659,15 +680,13 @@ CMT_BIN_OP(^)
namespace details
{
-template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep>
-struct cvalseq_impl;
-
-template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep>
-using cgen_seq = typename cvalseq_impl<T, Nsize, Nstart, Nstep>::type;
template <typename T, size_t Nsize, T Nstart, ptrdiff_t Nstep>
-struct cvalseq_impl : concat_impl<cgen_seq<T, Nsize / 2, Nstart, Nstep>,
- cgen_seq<T, Nsize - Nsize / 2, Nstart + (Nsize / 2) * Nstep, Nstep>>
+struct cvalseq_impl
+ : concat_impl<typename cvalseq_impl<T, Nsize / 2, Nstart, Nstep>::type,
+ typename cvalseq_impl<T, Nsize - Nsize / 2,
+ static_cast<T>(Nstart + static_cast<ptrdiff_t>(Nsize / 2) * Nstep),
+ Nstep>::type>
{
};
@@ -679,6 +698,10 @@ template <typename T, T Nstart, ptrdiff_t Nstep>
struct cvalseq_impl<T, 1, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart)>
{
};
+template <typename T, T Nstart, ptrdiff_t Nstep>
+struct cvalseq_impl<T, 2, Nstart, Nstep> : cvals_t<T, static_cast<T>(Nstart), static_cast<T>(Nstart + Nstep)>
+{
+};
} // namespace details
template <typename T, size_t size, T start = T(), ptrdiff_t step = 1>
@@ -691,9 +714,11 @@ template <typename... List>
using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>;
template <size_t group, size_t... indices, size_t N = group * sizeof...(indices)>
-constexpr inline auto scale(csizes_t<indices...> i) noexcept
+constexpr inline auto scale(csizes_t<indices...> i) CMT_NOEXCEPT
{
- return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() % csize_t<group>();
+ return cconcat(csizeseq_t<group, group * indices>()...);
+ // return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() %
+ // csize_t<group>();
}
namespace details
@@ -814,12 +839,14 @@ constexpr inline unsigned ilog2(T n, unsigned p = 0)
return (n <= 1) ? p : ilog2(n / 2, p + 1);
}
+/// @brief Returns a nearest power of two that is greater or equal than n
template <typename T>
constexpr inline T next_poweroftwo(T n)
{
return n > 2 ? T(1) << (ilog2(n - 1) + 1) : n;
}
+/// @brief Returns a nearest power of two that is less or equal than n
template <typename T>
constexpr inline T prev_poweroftwo(T n)
{
@@ -1007,7 +1034,7 @@ template <>
constexpr size_t elementsize<void>()
{
return 1;
-};
+}
} // namespace details
/// @brief Utility typedef used to disable type deduction
@@ -1018,7 +1045,7 @@ using identity = typename details::identity_impl<T>::type;
struct swallow
{
template <typename... T>
- CMT_INTRIN constexpr swallow(T&&...) noexcept
+ CMT_MEM_INTRINSIC constexpr swallow(T&&...) CMT_NOEXCEPT
{
}
};
@@ -1029,52 +1056,52 @@ struct carray;
template <typename T>
struct carray<T, 1>
{
- CMT_INTRIN constexpr carray() noexcept = default;
- CMT_INTRIN constexpr carray(T val) noexcept : val(val) {}
+ CMT_MEM_INTRINSIC constexpr carray() CMT_NOEXCEPT = default;
+ CMT_MEM_INTRINSIC constexpr carray(T val) CMT_NOEXCEPT : val(val) {}
template <typename Fn, size_t index = 0, CMT_ENABLE_IF(is_callable<Fn, csize_t<index>>::value)>
- CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept
+ CMT_MEM_INTRINSIC constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) CMT_NOEXCEPT
: val(static_cast<T>(fn(csize_t<index>())))
{
}
- CMT_INTRIN constexpr carray(const carray&) noexcept = default;
- CMT_INTRIN constexpr carray(carray&&) noexcept = default;
- CMT_INTRIN static constexpr size_t size() noexcept { return 1; }
+ CMT_MEM_INTRINSIC constexpr carray(const carray&) CMT_NOEXCEPT = default;
+ CMT_MEM_INTRINSIC constexpr carray(carray&&) CMT_NOEXCEPT = default;
+ CMT_MEM_INTRINSIC static constexpr size_t size() CMT_NOEXCEPT { return 1; }
template <size_t index>
- CMT_INTRIN constexpr T& get(csize_t<index>) noexcept
+ CMT_MEM_INTRINSIC constexpr T& get(csize_t<index>) CMT_NOEXCEPT
{
static_assert(index == 0, "carray: Array index is out of range");
return val;
}
template <size_t index>
- CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept
+ CMT_MEM_INTRINSIC constexpr const T& get(csize_t<index>) const CMT_NOEXCEPT
{
static_assert(index == 0, "carray: Array index is out of range");
return val;
}
template <size_t index>
- CMT_INTRIN constexpr T& get() noexcept
+ CMT_MEM_INTRINSIC constexpr T& get() CMT_NOEXCEPT
{
return get(csize_t<index>());
}
template <size_t index>
- CMT_INTRIN constexpr const T& get() const noexcept
+ CMT_MEM_INTRINSIC constexpr const T& get() const CMT_NOEXCEPT
{
return get(csize_t<index>());
}
- CMT_INTRIN constexpr const T* front() const noexcept { return val; }
- CMT_INTRIN constexpr T* front() noexcept { return val; }
- CMT_INTRIN constexpr const T* back() const noexcept { return val; }
- CMT_INTRIN constexpr T* back() noexcept { return val; }
- CMT_INTRIN constexpr const T* begin() const noexcept { return &val; }
- CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; }
- CMT_INTRIN constexpr T* begin() noexcept { return &val; }
- CMT_INTRIN constexpr T* end() noexcept { return &val + 1; }
- CMT_INTRIN constexpr const T* data() const noexcept { return begin(); }
- CMT_INTRIN constexpr T* data() noexcept { return begin(); }
- CMT_INTRIN constexpr bool empty() const noexcept { return false; }
+ CMT_MEM_INTRINSIC constexpr const T* front() const CMT_NOEXCEPT { return val; }
+ CMT_MEM_INTRINSIC constexpr T* front() CMT_NOEXCEPT { return val; }
+ CMT_MEM_INTRINSIC constexpr const T* back() const CMT_NOEXCEPT { return val; }
+ CMT_MEM_INTRINSIC constexpr T* back() CMT_NOEXCEPT { return val; }
+ CMT_MEM_INTRINSIC constexpr const T* begin() const CMT_NOEXCEPT { return &val; }
+ CMT_MEM_INTRINSIC constexpr const T* end() const CMT_NOEXCEPT { return &val + 1; }
+ CMT_MEM_INTRINSIC constexpr T* begin() CMT_NOEXCEPT { return &val; }
+ CMT_MEM_INTRINSIC constexpr T* end() CMT_NOEXCEPT { return &val + 1; }
+ CMT_MEM_INTRINSIC constexpr const T* data() const CMT_NOEXCEPT { return begin(); }
+ CMT_MEM_INTRINSIC constexpr T* data() CMT_NOEXCEPT { return begin(); }
+ CMT_MEM_INTRINSIC constexpr bool empty() const CMT_NOEXCEPT { return false; }
T val;
};
@@ -1082,55 +1109,56 @@ template <typename T, size_t N>
struct carray : carray<T, N - 1>
{
template <typename... Ts>
- CMT_INTRIN constexpr carray(T first, Ts... list) noexcept : carray<T, N - 1>(list...), val(first)
+ CMT_MEM_INTRINSIC constexpr carray(T first, Ts... list) CMT_NOEXCEPT : carray<T, N - 1>(list...),
+ val(first)
{
static_assert(sizeof...(list) + 1 == N, "carray: Argument count is invalid");
}
template <typename Fn, size_t index = N - 1>
- CMT_INTRIN constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) noexcept
+ CMT_MEM_INTRINSIC constexpr carray(Fn&& fn, csize_t<index> = csize_t<index>{}) CMT_NOEXCEPT
: carray<T, N - 1>(std::forward<Fn>(fn), csize_t<index - 1>()),
val(static_cast<T>(fn(csize_t<index>())))
{
}
- CMT_INTRIN constexpr carray() noexcept = default;
- CMT_INTRIN constexpr carray(const carray&) noexcept = default;
- CMT_INTRIN constexpr carray(carray&&) noexcept = default;
- CMT_INTRIN static constexpr size_t size() noexcept { return N; }
- CMT_INTRIN constexpr T& get(csize_t<N - 1>) noexcept { return val; }
+ CMT_MEM_INTRINSIC constexpr carray() CMT_NOEXCEPT = default;
+ CMT_MEM_INTRINSIC constexpr carray(const carray&) CMT_NOEXCEPT = default;
+ CMT_MEM_INTRINSIC constexpr carray(carray&&) CMT_NOEXCEPT = default;
+ CMT_MEM_INTRINSIC static constexpr size_t size() CMT_NOEXCEPT { return N; }
+ CMT_MEM_INTRINSIC constexpr T& get(csize_t<N - 1>) CMT_NOEXCEPT { return val; }
template <size_t index>
- CMT_INTRIN constexpr T& get(csize_t<index>) noexcept
+ CMT_MEM_INTRINSIC constexpr T& get(csize_t<index>) CMT_NOEXCEPT
{
return carray<T, N - 1>::get(csize_t<index>());
}
- CMT_INTRIN constexpr const T& get(csize_t<N - 1>) const noexcept { return val; }
+ CMT_MEM_INTRINSIC constexpr const T& get(csize_t<N - 1>) const CMT_NOEXCEPT { return val; }
template <size_t index>
- CMT_INTRIN constexpr const T& get(csize_t<index>) const noexcept
+ CMT_MEM_INTRINSIC constexpr const T& get(csize_t<index>) const CMT_NOEXCEPT
{
return carray<T, N - 1>::get(csize_t<index>());
}
template <size_t index>
- CMT_INTRIN constexpr T& get() noexcept
+ CMT_MEM_INTRINSIC constexpr T& get() CMT_NOEXCEPT
{
return get(csize_t<index>());
}
template <size_t index>
- CMT_INTRIN constexpr const T& get() const noexcept
+ CMT_MEM_INTRINSIC constexpr const T& get() const CMT_NOEXCEPT
{
return get(csize_t<index>());
}
- CMT_INTRIN constexpr const T* front() const noexcept { return carray<T, N - 1>::front(); }
- CMT_INTRIN constexpr T* front() noexcept { return carray<T, N - 1>::front(); }
- CMT_INTRIN constexpr const T* back() const noexcept { return val; }
- CMT_INTRIN constexpr T* back() noexcept { return val; }
- CMT_INTRIN constexpr const T* begin() const noexcept { return carray<T, N - 1>::begin(); }
- CMT_INTRIN constexpr const T* end() const noexcept { return &val + 1; }
- CMT_INTRIN constexpr T* begin() noexcept { return carray<T, N - 1>::begin(); }
- CMT_INTRIN constexpr T* end() noexcept { return &val + 1; }
- CMT_INTRIN constexpr const T* data() const noexcept { return begin(); }
- CMT_INTRIN constexpr T* data() noexcept { return begin(); }
- CMT_INTRIN constexpr bool empty() const noexcept { return false; }
+ CMT_MEM_INTRINSIC constexpr const T* front() const CMT_NOEXCEPT { return carray<T, N - 1>::front(); }
+ CMT_MEM_INTRINSIC constexpr T* front() CMT_NOEXCEPT { return carray<T, N - 1>::front(); }
+ CMT_MEM_INTRINSIC constexpr const T* back() const CMT_NOEXCEPT { return val; }
+ CMT_MEM_INTRINSIC constexpr T* back() CMT_NOEXCEPT { return val; }
+ CMT_MEM_INTRINSIC constexpr const T* begin() const CMT_NOEXCEPT { return carray<T, N - 1>::begin(); }
+ CMT_MEM_INTRINSIC constexpr const T* end() const CMT_NOEXCEPT { return &val + 1; }
+ CMT_MEM_INTRINSIC constexpr T* begin() CMT_NOEXCEPT { return carray<T, N - 1>::begin(); }
+ CMT_MEM_INTRINSIC constexpr T* end() CMT_NOEXCEPT { return &val + 1; }
+ CMT_MEM_INTRINSIC constexpr const T* data() const CMT_NOEXCEPT { return begin(); }
+ CMT_MEM_INTRINSIC constexpr T* data() CMT_NOEXCEPT { return begin(); }
+ CMT_MEM_INTRINSIC constexpr bool empty() const CMT_NOEXCEPT { return false; }
private:
T val;
@@ -1162,45 +1190,52 @@ private:
/// @brief Function that returns its first argument
template <typename T>
-CMT_INTRIN constexpr T&& pass_through(T&& x) noexcept
+CMT_INTRINSIC constexpr T&& pass_through(T&& x) CMT_NOEXCEPT
{
return std::forward<T>(x);
}
/// @brief Function that returns void and ignores all its arguments
template <typename... Ts>
-CMT_INTRIN constexpr void noop(Ts&&...) noexcept
+CMT_INTRINSIC constexpr void noop(Ts&&...) CMT_NOEXCEPT
{
}
/// @brief Function that returns its first argument and ignores all other arguments
template <typename T1, typename... Ts>
-CMT_INTRIN constexpr T1&& get_first(T1&& x, Ts&&...) noexcept
+CMT_INTRINSIC constexpr T1&& get_first(T1&& x, Ts&&...) CMT_NOEXCEPT
{
return std::forward<T1>(x);
}
/// @brief Function that returns its second argument and ignores all other arguments
template <typename T1, typename T2, typename... Ts>
-CMT_INTRIN constexpr T2&& get_second(T1, T2&& x, Ts&&...) noexcept
+CMT_INTRINSIC constexpr T2&& get_second(T1, T2&& x, Ts&&...) CMT_NOEXCEPT
{
return std::forward<T2>(x);
}
/// @brief Function that returns its third argument and ignores all other arguments
template <typename T1, typename T2, typename T3, typename... Ts>
-CMT_INTRIN constexpr T3&& get_third(T1&&, T2&&, T3&& x, Ts&&...) noexcept
+CMT_INTRINSIC constexpr T3&& get_third(T1&&, T2&&, T3&& x, Ts&&...) CMT_NOEXCEPT
{
return std::forward<T3>(x);
}
/// @brief Function that returns value-initialization of type T and ignores all its arguments
template <typename T, typename... Ts>
-CMT_INTRIN constexpr T returns(Ts&&...)
+CMT_INTRINSIC constexpr T returns(Ts&&...)
{
return T();
}
+/// @brief Function that returns constant of type T and ignores all its arguments
+template <typename T, T value, typename... Args>
+CMT_INTRINSIC constexpr T return_constant(Args&&...)
+{
+ return value;
+}
+
CMT_FN(pass_through)
CMT_FN(noop)
CMT_FN(get_first)
@@ -1208,33 +1243,43 @@ CMT_FN(get_second)
CMT_FN(get_third)
CMT_FN_TPL((typename T), (T), returns)
+template <typename T, T value>
+struct fn_return_constant
+{
+ template <typename... Args>
+ constexpr T operator()(Args&&...) const noexcept
+ {
+ return value;
+ }
+};
+
template <typename T1, typename T2>
-CMT_INTRIN bool is_equal(const T1& x, const T2& y)
+CMT_INTRINSIC bool is_equal(const T1& x, const T2& y)
{
return x == y;
}
template <typename T1, typename T2>
-CMT_INTRIN bool is_notequal(const T1& x, const T2& y)
+CMT_INTRINSIC bool is_notequal(const T1& x, const T2& y)
{
return x != y;
}
template <typename T1, typename T2>
-CMT_INTRIN bool is_less(const T1& x, const T2& y)
+CMT_INTRINSIC bool is_less(const T1& x, const T2& y)
{
return x < y;
}
template <typename T1, typename T2>
-CMT_INTRIN bool is_greater(const T1& x, const T2& y)
+CMT_INTRINSIC bool is_greater(const T1& x, const T2& y)
{
return x > y;
}
template <typename T1, typename T2>
-CMT_INTRIN bool is_lessorequal(const T1& x, const T2& y)
+CMT_INTRINSIC bool is_lessorequal(const T1& x, const T2& y)
{
return x <= y;
}
template <typename T1, typename T2>
-CMT_INTRIN bool is_greaterorequal(const T1& x, const T2& y)
+CMT_INTRINSIC bool is_greaterorequal(const T1& x, const T2& y)
{
return x >= y;
}
@@ -1313,7 +1358,7 @@ void cforeach_impl(Fn&& fn)
#endif
template <typename T, T... values, typename Fn>
-CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn)
+CMT_INTRINSIC void cforeach(cvals_t<T, values...>, Fn&& fn)
{
#ifdef CMT_COMPILER_CLANG
swallow{ (fn(cval_t<T, values>()), void(), 0)... };
@@ -1323,7 +1368,7 @@ CMT_INTRIN void cforeach(cvals_t<T, values...>, Fn&& fn)
}
template <typename T, typename Fn, CMT_ENABLE_IF(has_begin_end<T>::value)>
-CMT_INTRIN void cforeach(T&& list, Fn&& fn)
+CMT_INTRINSIC void cforeach(T&& list, Fn&& fn)
{
for (const auto& v : list)
{
@@ -1332,7 +1377,7 @@ CMT_INTRIN void cforeach(T&& list, Fn&& fn)
}
template <typename T, size_t N, typename Fn>
-CMT_INTRIN void cforeach(const T (&array)[N], Fn&& fn)
+CMT_INTRINSIC void cforeach(const T (&array)[N], Fn&& fn)
{
for (size_t i = 0; i < N; i++)
{
@@ -1344,59 +1389,94 @@ namespace details
{
template <size_t index, typename... types>
-CMT_INTRIN auto get_type_arg(ctypes_t<types...>)
+CMT_INTRINSIC auto get_type_arg(ctypes_t<types...>)
{
return ctype_t<type_of<details::get_nth_type<index, types...>>>();
}
template <typename T0, typename... types, typename Fn, size_t... indices>
-CMT_INTRIN void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, csizes_t<indices...>)
+CMT_INTRINSIC void cforeach_types_impl(ctypes_t<T0, types...> type_list, Fn&& fn, csizes_t<indices...>)
{
swallow{ (fn(get_type_arg<indices>(type_list)), void(), 0)... };
}
+template <typename Fn>
+CMT_INTRINSIC void cforeach_types_impl(ctypes_t<>, Fn&&, csizes_t<>)
+{
+}
} // namespace details
template <typename... Ts, typename Fn>
-CMT_INTRIN void cforeach(ctypes_t<Ts...> types, Fn&& fn)
+CMT_INTRINSIC void cforeach(ctypes_t<Ts...> types, Fn&& fn)
{
details::cforeach_types_impl(types, std::forward<Fn>(fn), csizeseq_t<sizeof...(Ts)>());
}
template <typename A0, typename A1, typename Fn>
-CMT_INTRIN void cforeach(A0&& a0, A1&& a1, Fn&& fn)
+CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, Fn&& fn)
{
- cforeach(std::forward<A0>(a0),
- [&](auto v0) { cforeach(std::forward<A1>(a1), [&](auto v1) { fn(v0, v1); }); });
+ // Default capture causes ICE in Intel C++
+ cforeach(std::forward<A0>(a0), //
+ [&a1, &fn](auto v0) { //
+ cforeach(std::forward<A1>(a1), //
+ [&v0, &fn](auto v1) { fn(v0, v1); });
+ });
}
template <typename A0, typename A1, typename A2, typename Fn>
-CMT_INTRIN void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn)
-{
- cforeach(std::forward<A0>(a0), [&](auto v0) {
- cforeach(std::forward<A1>(a1),
- [&](auto v1) { cforeach(std::forward<A2>(a2), [&](auto v2) { fn(v0, v1, v2); }); });
- });
+CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, A2&& a2, Fn&& fn)
+{
+ // Default capture causes ICE in Intel C++
+ cforeach(std::forward<A0>(a0), //
+ [&a1, &a2, &fn](auto v0) { //
+ cforeach(std::forward<A1>(a1), //
+ [&v0, &a2, &fn](auto v1) { //
+ cforeach(std::forward<A2>(a2), //
+ [&v0, &v1, &fn](auto v2) { //
+ fn(v0, v1, v2);
+ });
+ });
+ });
+}
+
+template <typename A0, typename A1, typename A2, typename A3, typename Fn>
+CMT_INTRINSIC void cforeach(A0&& a0, A1&& a1, A2&& a2, A3&& a3, Fn&& fn)
+{
+ // Default capture causes ICE in Intel C++
+ cforeach(std::forward<A0>(a0), //
+ [&a1, &a2, &a3, &fn](auto v0) { //
+ cforeach(std::forward<A1>(a1), //
+ [&v0, &a2, &a3, &fn](auto v1) { //
+ cforeach(std::forward<A2>(a2), //
+ [&v0, &v1, &a3, &fn](auto v2) { //
+ cforeach(std::forward<A3>(a3), //
+ [&v0, &v1, &v2, &fn](auto v3) //
+ { fn(v0, v1, v2, v3); });
+ });
+ });
+ });
}
+
template <typename TrueFn, typename FalseFn = fn_noop>
-CMT_INTRIN decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn())
+CMT_INTRINSIC decltype(auto) cif(cbool_t<true>, TrueFn&& truefn, FalseFn&& = FalseFn())
{
return truefn(ctrue);
}
template <typename TrueFn, typename FalseFn = fn_noop>
-CMT_INTRIN decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn())
+CMT_INTRINSIC decltype(auto) cif(cbool_t<false>, TrueFn&&, FalseFn&& falsefn = FalseFn())
{
return falsefn(cfalse);
}
template <typename T, T start, T stop, typename BodyFn>
-CMT_INTRIN decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn)
+CMT_INTRINSIC decltype(auto) cfor(cval_t<T, start>, cval_t<T, stop>, BodyFn&& bodyfn)
{
return cforeach(cvalseq_t<T, stop - start, start>(), std::forward<BodyFn>(bodyfn));
}
template <typename T, T... vs, typename U, typename Function, typename Fallback = fn_noop>
-void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&& fallback = Fallback())
+CMT_INTRINSIC void cswitch(cvals_t<T, vs...>, const U& value, Function&& function,
+ Fallback&& fallback = Fallback())
{
bool result = false;
swallow{ (result = result || ((vs == value) ? (function(cval_t<T, vs>()), void(), true) : false), void(),
@@ -1406,14 +1486,15 @@ void cswitch(cvals_t<T, vs...>, const U& value, Function&& function, Fallback&&
}
template <typename T, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal>
-CMT_INTRIN decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(), CmpFn&& = CmpFn())
+CMT_INTRINSIC decltype(auto) cswitch(cvals_t<T>, identity<T>, Fn&&, DefFn&& deffn = DefFn(),
+ CmpFn&& = CmpFn())
{
return deffn();
}
template <typename T, T v0, T... values, typename Fn, typename DefFn = fn_noop, typename CmpFn = fn_is_equal>
-CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn,
- DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn())
+CMT_INTRINSIC decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value, Fn&& fn,
+ DefFn&& deffn = DefFn(), CmpFn&& cmpfn = CmpFn())
{
if (cmpfn(value, v0))
{
@@ -1428,7 +1509,6 @@ CMT_INTRIN decltype(auto) cswitch(cvals_t<T, v0, values...>, identity<T> value,
namespace details
{
-
template <typename T, typename Fn1, typename Fn2, typename... Fns>
inline decltype(auto) cmatch_impl(T&& value, Fn1&& first, Fn2&& second, Fns&&... rest);
template <typename T, typename Fn, typename... Ts>
@@ -1491,15 +1571,15 @@ template <typename Fn>
struct fn_noinline
{
template <typename... Args>
- CMT_INTRIN result_of<Fn(Args...)> operator()(Args&&... args) const
+ CMT_MEM_INTRINSIC result_of<Fn(Args...)> operator()(Args&&... args) const
{
return noinline(Fn{}, std::forward<Args>(args)...);
}
-};
+};
template <typename... Args, typename Fn, typename Ret = decltype(std::declval<Fn>()(std::declval<Args>()...)),
typename NonMemFn = Ret (*)(Fn*, Args...)>
-CMT_INTRIN NonMemFn make_nonmember(const Fn&)
+CMT_INTRINSIC NonMemFn make_nonmember(const Fn&)
{
return [](Fn* fn, Args... args) -> Ret { return fn->operator()(std::forward<Args>(args)...); };
}
@@ -1510,6 +1590,11 @@ constexpr inline T choose_const()
static_assert(sizeof(T) != 0, "T not found in the list of template arguments");
return T();
}
+template <typename T, typename C1>
+constexpr inline T choose_const_fallback(C1 c1)
+{
+ return static_cast<T>(c1);
+}
/**
* Selects constant of the specific type
@@ -1518,10 +1603,21 @@ constexpr inline T choose_const()
* CHECK( choose_const<f64>( 32.0f, 64.0 ) == 64.0 );
* @endcode
*/
+template <typename T, typename C1, typename... Cs, CMT_ENABLE_IF(std::is_same<T, C1>::value)>
+constexpr inline T choose_const(C1 c1, Cs...)
+{
+ return static_cast<T>(c1);
+}
+template <typename T, typename C1, typename... Cs, CMT_ENABLE_IF(!std::is_same<T, C1>::value)>
+constexpr inline T choose_const(C1, Cs... constants)
+{
+ return choose_const<T>(constants...);
+}
+
template <typename T, typename C1, typename... Cs>
-constexpr inline T choose_const(C1 c1, Cs... constants)
+constexpr inline T choose_const_fallback(C1 c1, Cs... constants)
{
- return std::is_same<T, C1>::value ? static_cast<T>(c1) : choose_const<T>(constants...);
+ return std::is_same<T, C1>::value ? static_cast<T>(c1) : choose_const_fallback<T>(constants...);
}
template <typename Tfrom>
@@ -1529,14 +1625,14 @@ struct autocast_impl
{
const Tfrom value;
template <typename T>
- CMT_INTRIN constexpr operator T() const noexcept
+ CMT_MEM_INTRINSIC constexpr operator T() const CMT_NOEXCEPT
{
return static_cast<T>(value);
}
};
template <typename Tfrom>
-CMT_INTRIN constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) noexcept
+CMT_INTRINSIC constexpr autocast_impl<Tfrom> autocast(const Tfrom& value) CMT_NOEXCEPT
{
return { value };
}
@@ -1603,49 +1699,49 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wundefined-reinterpret-cast")
#endif
template <typename T, typename U>
-CMT_INLINE constexpr static T& ref_cast(U& ptr)
+CMT_INTRINSIC constexpr static T& ref_cast(U& ptr)
{
return reinterpret_cast<T&>(ptr);
}
template <typename T, typename U>
-CMT_INLINE constexpr static const T& ref_cast(const U& ptr)
+CMT_INTRINSIC constexpr static const T& ref_cast(const U& ptr)
{
return reinterpret_cast<const T&>(ptr);
}
template <typename T, typename U>
-CMT_INLINE constexpr static T* ptr_cast(U* ptr)
+CMT_INTRINSIC constexpr static T* ptr_cast(U* ptr)
{
return reinterpret_cast<T*>(ptr);
}
template <typename T, typename U>
-CMT_INLINE constexpr static const T* ptr_cast(const U* ptr)
+CMT_INTRINSIC constexpr static const T* ptr_cast(const U* ptr)
{
return reinterpret_cast<const T*>(ptr);
}
template <typename T, typename U>
-CMT_INLINE constexpr static T* ptr_cast(U* ptr, ptrdiff_t offset)
+CMT_INTRINSIC constexpr static T* ptr_cast(U* ptr, ptrdiff_t offset)
{
return ptr_cast<T>(ptr_cast<unsigned char>(ptr) + offset);
}
template <typename T, typename U>
-CMT_INLINE constexpr static T* derived_cast(U* ptr)
+CMT_INTRINSIC constexpr static T* derived_cast(U* ptr)
{
return static_cast<T*>(ptr);
}
template <typename T, typename U>
-CMT_INLINE constexpr static const T* derived_cast(const U* ptr)
+CMT_INTRINSIC constexpr static const T* derived_cast(const U* ptr)
{
return static_cast<const T*>(ptr);
}
template <typename T, typename U>
-CMT_INLINE constexpr static T implicit_cast(U&& value)
+CMT_INTRINSIC constexpr static T implicit_cast(U&& value)
{
return std::forward<T>(value);
}
@@ -1751,6 +1847,228 @@ constexpr conditional<std::is_scalar<T>::value, T, const T&> const_min(const T&
return x < y ? x : y;
}
+template <int n = 10>
+struct overload_priority : overload_priority<n - 1>
+{
+};
+
+template <>
+struct overload_priority<0>
+{
+};
+
+constexpr overload_priority<> overload_auto{};
+
+using overload_generic = overload_priority<0>;
+
+#define CMT_GEN_LIST1(m, ...) m(0, __VA_ARGS__)
+#define CMT_GEN_LIST2(m, ...) CMT_GEN_LIST1(m, __VA_ARGS__), m(1, __VA_ARGS__)
+#define CMT_GEN_LIST3(m, ...) CMT_GEN_LIST2(m, __VA_ARGS__), m(2, __VA_ARGS__)
+#define CMT_GEN_LIST4(m, ...) CMT_GEN_LIST3(m, __VA_ARGS__), m(3, __VA_ARGS__)
+#define CMT_GEN_LIST5(m, ...) CMT_GEN_LIST4(m, __VA_ARGS__), m(4, __VA_ARGS__)
+#define CMT_GEN_LIST6(m, ...) CMT_GEN_LIST5(m, __VA_ARGS__), m(5, __VA_ARGS__)
+#define CMT_GEN_LIST7(m, ...) CMT_GEN_LIST6(m, __VA_ARGS__), m(6, __VA_ARGS__)
+#define CMT_GEN_LIST8(m, ...) CMT_GEN_LIST7(m, __VA_ARGS__), m(7, __VA_ARGS__)
+#define CMT_GEN_LIST9(m, ...) CMT_GEN_LIST8(m, __VA_ARGS__), m(8, __VA_ARGS__)
+#define CMT_GEN_LIST10(m, ...) CMT_GEN_LIST9(m, __VA_ARGS__), m(9, __VA_ARGS__)
+
+#define CMT_GEN_LIST11(m, ...) CMT_GEN_LIST10(m, __VA_ARGS__), m(10, __VA_ARGS__)
+#define CMT_GEN_LIST12(m, ...) CMT_GEN_LIST11(m, __VA_ARGS__), m(11, __VA_ARGS__)
+#define CMT_GEN_LIST13(m, ...) CMT_GEN_LIST12(m, __VA_ARGS__), m(12, __VA_ARGS__)
+#define CMT_GEN_LIST14(m, ...) CMT_GEN_LIST13(m, __VA_ARGS__), m(13, __VA_ARGS__)
+#define CMT_GEN_LIST15(m, ...) CMT_GEN_LIST14(m, __VA_ARGS__), m(14, __VA_ARGS__)
+#define CMT_GEN_LIST16(m, ...) CMT_GEN_LIST15(m, __VA_ARGS__), m(15, __VA_ARGS__)
+#define CMT_GEN_LIST17(m, ...) CMT_GEN_LIST16(m, __VA_ARGS__), m(16, __VA_ARGS__)
+#define CMT_GEN_LIST18(m, ...) CMT_GEN_LIST17(m, __VA_ARGS__), m(17, __VA_ARGS__)
+#define CMT_GEN_LIST19(m, ...) CMT_GEN_LIST18(m, __VA_ARGS__), m(18, __VA_ARGS__)
+#define CMT_GEN_LIST20(m, ...) CMT_GEN_LIST19(m, __VA_ARGS__), m(19, __VA_ARGS__)
+
+#define CMT_GEN_LIST21(m, ...) CMT_GEN_LIST20(m, __VA_ARGS__), m(20, __VA_ARGS__)
+#define CMT_GEN_LIST22(m, ...) CMT_GEN_LIST21(m, __VA_ARGS__), m(21, __VA_ARGS__)
+#define CMT_GEN_LIST23(m, ...) CMT_GEN_LIST22(m, __VA_ARGS__), m(22, __VA_ARGS__)
+#define CMT_GEN_LIST24(m, ...) CMT_GEN_LIST23(m, __VA_ARGS__), m(23, __VA_ARGS__)
+#define CMT_GEN_LIST25(m, ...) CMT_GEN_LIST24(m, __VA_ARGS__), m(24, __VA_ARGS__)
+#define CMT_GEN_LIST26(m, ...) CMT_GEN_LIST25(m, __VA_ARGS__), m(25, __VA_ARGS__)
+#define CMT_GEN_LIST27(m, ...) CMT_GEN_LIST26(m, __VA_ARGS__), m(26, __VA_ARGS__)
+#define CMT_GEN_LIST28(m, ...) CMT_GEN_LIST27(m, __VA_ARGS__), m(27, __VA_ARGS__)
+#define CMT_GEN_LIST29(m, ...) CMT_GEN_LIST28(m, __VA_ARGS__), m(28, __VA_ARGS__)
+#define CMT_GEN_LIST30(m, ...) CMT_GEN_LIST29(m, __VA_ARGS__), m(29, __VA_ARGS__)
+
+#define CMT_GEN_LIST31(m, ...) CMT_GEN_LIST30(m, __VA_ARGS__), m(30, __VA_ARGS__)
+#define CMT_GEN_LIST32(m, ...) CMT_GEN_LIST31(m, __VA_ARGS__), m(31, __VA_ARGS__)
+#define CMT_GEN_LIST33(m, ...) CMT_GEN_LIST32(m, __VA_ARGS__), m(32, __VA_ARGS__)
+#define CMT_GEN_LIST34(m, ...) CMT_GEN_LIST33(m, __VA_ARGS__), m(33, __VA_ARGS__)
+#define CMT_GEN_LIST35(m, ...) CMT_GEN_LIST34(m, __VA_ARGS__), m(34, __VA_ARGS__)
+#define CMT_GEN_LIST36(m, ...) CMT_GEN_LIST35(m, __VA_ARGS__), m(35, __VA_ARGS__)
+#define CMT_GEN_LIST37(m, ...) CMT_GEN_LIST36(m, __VA_ARGS__), m(36, __VA_ARGS__)
+#define CMT_GEN_LIST38(m, ...) CMT_GEN_LIST37(m, __VA_ARGS__), m(37, __VA_ARGS__)
+#define CMT_GEN_LIST39(m, ...) CMT_GEN_LIST38(m, __VA_ARGS__), m(38, __VA_ARGS__)
+#define CMT_GEN_LIST40(m, ...) CMT_GEN_LIST39(m, __VA_ARGS__), m(39, __VA_ARGS__)
+
+#define CMT_GEN_LIST41(m, ...) CMT_GEN_LIST40(m, __VA_ARGS__), m(40, __VA_ARGS__)
+#define CMT_GEN_LIST42(m, ...) CMT_GEN_LIST41(m, __VA_ARGS__), m(41, __VA_ARGS__)
+#define CMT_GEN_LIST43(m, ...) CMT_GEN_LIST42(m, __VA_ARGS__), m(42, __VA_ARGS__)
+#define CMT_GEN_LIST44(m, ...) CMT_GEN_LIST43(m, __VA_ARGS__), m(43, __VA_ARGS__)
+#define CMT_GEN_LIST45(m, ...) CMT_GEN_LIST44(m, __VA_ARGS__), m(44, __VA_ARGS__)
+#define CMT_GEN_LIST46(m, ...) CMT_GEN_LIST45(m, __VA_ARGS__), m(45, __VA_ARGS__)
+#define CMT_GEN_LIST47(m, ...) CMT_GEN_LIST46(m, __VA_ARGS__), m(46, __VA_ARGS__)
+#define CMT_GEN_LIST48(m, ...) CMT_GEN_LIST47(m, __VA_ARGS__), m(47, __VA_ARGS__)
+#define CMT_GEN_LIST49(m, ...) CMT_GEN_LIST48(m, __VA_ARGS__), m(48, __VA_ARGS__)
+#define CMT_GEN_LIST50(m, ...) CMT_GEN_LIST49(m, __VA_ARGS__), m(49, __VA_ARGS__)
+
+#define CMT_GEN_LIST51(m, ...) CMT_GEN_LIST50(m, __VA_ARGS__), m(50, __VA_ARGS__)
+#define CMT_GEN_LIST52(m, ...) CMT_GEN_LIST51(m, __VA_ARGS__), m(51, __VA_ARGS__)
+#define CMT_GEN_LIST53(m, ...) CMT_GEN_LIST52(m, __VA_ARGS__), m(52, __VA_ARGS__)
+#define CMT_GEN_LIST54(m, ...) CMT_GEN_LIST53(m, __VA_ARGS__), m(53, __VA_ARGS__)
+#define CMT_GEN_LIST55(m, ...) CMT_GEN_LIST54(m, __VA_ARGS__), m(54, __VA_ARGS__)
+#define CMT_GEN_LIST56(m, ...) CMT_GEN_LIST55(m, __VA_ARGS__), m(55, __VA_ARGS__)
+#define CMT_GEN_LIST57(m, ...) CMT_GEN_LIST56(m, __VA_ARGS__), m(56, __VA_ARGS__)
+#define CMT_GEN_LIST58(m, ...) CMT_GEN_LIST57(m, __VA_ARGS__), m(57, __VA_ARGS__)
+#define CMT_GEN_LIST59(m, ...) CMT_GEN_LIST58(m, __VA_ARGS__), m(58, __VA_ARGS__)
+#define CMT_GEN_LIST60(m, ...) CMT_GEN_LIST59(m, __VA_ARGS__), m(59, __VA_ARGS__)
+
+#define CMT_GEN_LIST61(m, ...) CMT_GEN_LIST60(m, __VA_ARGS__), m(60, __VA_ARGS__)
+#define CMT_GEN_LIST62(m, ...) CMT_GEN_LIST61(m, __VA_ARGS__), m(61, __VA_ARGS__)
+#define CMT_GEN_LIST63(m, ...) CMT_GEN_LIST62(m, __VA_ARGS__), m(62, __VA_ARGS__)
+#define CMT_GEN_LIST64(m, ...) CMT_GEN_LIST63(m, __VA_ARGS__), m(63, __VA_ARGS__)
+#define CMT_GEN_LIST65(m, ...) CMT_GEN_LIST64(m, __VA_ARGS__), m(64, __VA_ARGS__)
+#define CMT_GEN_LIST66(m, ...) CMT_GEN_LIST65(m, __VA_ARGS__), m(65, __VA_ARGS__)
+#define CMT_GEN_LIST67(m, ...) CMT_GEN_LIST66(m, __VA_ARGS__), m(66, __VA_ARGS__)
+#define CMT_GEN_LIST68(m, ...) CMT_GEN_LIST67(m, __VA_ARGS__), m(67, __VA_ARGS__)
+#define CMT_GEN_LIST69(m, ...) CMT_GEN_LIST68(m, __VA_ARGS__), m(68, __VA_ARGS__)
+#define CMT_GEN_LIST70(m, ...) CMT_GEN_LIST69(m, __VA_ARGS__), m(69, __VA_ARGS__)
+
+#define CMT_GEN_LIST(c, m, ...) CMT_GEN_LIST##c(m, __VA_ARGS__)
+
+template <typename Tout, typename Tin>
+constexpr CMT_INLINE Tout bitcast_anything(const Tin& in)
+{
+ static_assert(sizeof(Tin) == sizeof(Tout), "Invalid arguments for bitcast_anything");
+#ifdef CMT_COMPILER_INTEL
+ const union {
+ const Tin in;
+ Tout out;
+ } u{ in };
+ return u.out;
+#else
+ union {
+ Tin in;
+ Tout out;
+ } u{ in };
+ return u.out;
+#endif
+}
+
+template <typename T>
+constexpr T dont_deduce(T x)
+{
+ return x;
+}
+
+template <typename Ty, typename T>
+constexpr T just_value(T value)
+{
+ return value;
+}
+
+enum class special_constant
+{
+ undefined,
+ default_constructed,
+ infinity,
+ neg_infinity,
+ min,
+ max,
+ neg_max,
+ lowest,
+ epsilon,
+ integer,
+ floating_point,
+ random_bits,
+};
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4700))
+CMT_PRAGMA_MSVC(warning(disable : 4146))
+struct special_value
+{
+ constexpr special_value(const special_value&) = default;
+ constexpr special_value(special_constant c) : c(c), ll(0), d(0) {}
+ constexpr special_value(double d) : c(special_constant::floating_point), ll(0), d(d) {}
+ constexpr special_value(long long ll) : c(special_constant::integer), ll(ll), d(0) {}
+ constexpr special_value(int i) : c(special_constant::integer), ll(i), d(0) {}
+
+ template <typename T>
+ constexpr T get() const CMT_NOEXCEPT
+ {
+ switch (c)
+ {
+ CMT_PRAGMA_GNU(GCC diagnostic push)
+ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wuninitialized")
+ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized")
+ case special_constant::undefined:
+ T undef;
+ return undef;
+ CMT_PRAGMA_GNU(GCC diagnostic pop)
+ case special_constant::default_constructed:
+ return T{};
+ case special_constant::infinity:
+ return std::numeric_limits<subtype<T>>::infinity();
+ case special_constant::neg_infinity:
+ {
+ subtype<T> gg = std::numeric_limits<subtype<T>>::infinity();
+ return -gg;
+ }
+ case special_constant::min:
+ return std::numeric_limits<subtype<T>>::min();
+ case special_constant::max:
+ return std::numeric_limits<subtype<T>>::max();
+ case special_constant::neg_max:
+ return static_cast<T>(-std::numeric_limits<subtype<T>>::max());
+ case special_constant::lowest:
+ return std::numeric_limits<subtype<T>>::lowest();
+ case special_constant::integer:
+ return static_cast<T>(ll);
+ case special_constant::floating_point:
+ return static_cast<T>(d);
+ case special_constant::random_bits:
+ return random_bits<T>();
+ default:
+ return T{};
+ }
+ }
+
+ template <typename T>
+ constexpr operator T() const CMT_NOEXCEPT
+ {
+ return get<T>();
+ }
+ special_constant c;
+ long long ll;
+ double d;
+
+ static std::mt19937& random_generator()
+ {
+ static std::mt19937 rnd(1);
+ return rnd;
+ }
+
+ template <typename T>
+ static T random_bits()
+ {
+ union {
+ uint32_t bits[(sizeof(T) + sizeof(uint32_t) - 1) / sizeof(uint32_t)];
+ T value;
+ } u;
+ for (uint32_t& b : u.bits)
+ {
+ b = random_generator()();
+ }
+ return u.value;
+ }
+};
+CMT_PRAGMA_MSVC(warning(pop))
+
CMT_PRAGMA_GNU(GCC diagnostic pop)
} // namespace cometa
diff --git a/include/kfr/cometa/array.hpp b/include/kfr/cometa/array.hpp
@@ -28,31 +28,32 @@ public:
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
- constexpr array_ref() noexcept : m_data(nullptr), m_size(0) {}
- constexpr array_ref(const array_ref&) noexcept = default;
- constexpr array_ref(array_ref&&) noexcept = default;
+ constexpr array_ref() CMT_NOEXCEPT : m_data(nullptr), m_size(0) {}
+ constexpr array_ref(const array_ref&) CMT_NOEXCEPT = default;
+ constexpr array_ref(array_ref&&) CMT_NOEXCEPT = default;
#ifdef CMT_COMPILER_GNU
- constexpr array_ref& operator=(const array_ref&) noexcept = default;
- constexpr array_ref& operator=(array_ref&&) noexcept = default;
+ constexpr array_ref& operator=(const array_ref&) CMT_NOEXCEPT = default;
+ constexpr array_ref& operator=(array_ref&&) CMT_NOEXCEPT = default;
#else
array_ref& operator=(const array_ref&) = default;
array_ref& operator=(array_ref&&) = default;
#endif
template <size_t N>
- constexpr array_ref(value_type (&arr)[N]) noexcept : m_data(arr), m_size(N)
+ constexpr array_ref(value_type (&arr)[N]) CMT_NOEXCEPT : m_data(arr), m_size(N)
{
}
template <size_t N>
- constexpr array_ref(const std::array<T, N>& arr) noexcept : m_data(arr.data()), m_size(N)
+ constexpr array_ref(const std::array<T, N>& arr) CMT_NOEXCEPT : m_data(arr.data()), m_size(N)
{
}
template <size_t N>
- constexpr array_ref(std::array<T, N>& arr) noexcept : m_data(arr.data()), m_size(N)
+ constexpr array_ref(std::array<T, N>& arr) CMT_NOEXCEPT : m_data(arr.data()), m_size(N)
{
}
template <typename Alloc>
- constexpr array_ref(const std::vector<T, Alloc>& vec) noexcept : m_data(vec.data()), m_size(vec.size())
+ constexpr array_ref(const std::vector<T, Alloc>& vec) CMT_NOEXCEPT : m_data(vec.data()),
+ m_size(vec.size())
{
}
@@ -61,26 +62,26 @@ public:
{
}
- constexpr array_ref(const std::initializer_list<T>& vec) noexcept
- : m_data(vec.begin()), m_size(vec.size())
+ constexpr array_ref(const std::initializer_list<T>& vec) CMT_NOEXCEPT : m_data(vec.begin()),
+ m_size(vec.size())
{
}
template <typename InputIter>
- constexpr array_ref(InputIter first, InputIter last) noexcept
- : m_data(std::addressof(*first)), m_size(std::distance(first, last))
+ constexpr array_ref(InputIter first, InputIter last) CMT_NOEXCEPT : m_data(std::addressof(*first)),
+ m_size(std::distance(first, last))
{
}
- constexpr array_ref(T* data, size_type size) noexcept : m_data(data), m_size(size) {}
-
- constexpr reference front() const noexcept { return m_data[0]; }
- constexpr reference back() const noexcept { return m_data[m_size - 1]; }
- constexpr iterator begin() const noexcept { return m_data; }
- constexpr iterator end() const noexcept { return m_data + m_size; }
- constexpr const_iterator cbegin() const noexcept { return m_data; }
- constexpr const_iterator cend() const noexcept { return m_data + m_size; }
- constexpr pointer data() const noexcept { return m_data; }
- constexpr std::size_t size() const noexcept { return m_size; }
- constexpr bool empty() const noexcept { return !m_size; }
+ constexpr array_ref(T* data, size_type size) CMT_NOEXCEPT : m_data(data), m_size(size) {}
+
+ constexpr reference front() const CMT_NOEXCEPT { return m_data[0]; }
+ constexpr reference back() const CMT_NOEXCEPT { return m_data[m_size - 1]; }
+ constexpr iterator begin() const CMT_NOEXCEPT { return m_data; }
+ constexpr iterator end() const CMT_NOEXCEPT { return m_data + m_size; }
+ constexpr const_iterator cbegin() const CMT_NOEXCEPT { return m_data; }
+ constexpr const_iterator cend() const CMT_NOEXCEPT { return m_data + m_size; }
+ constexpr pointer data() const CMT_NOEXCEPT { return m_data; }
+ constexpr std::size_t size() const CMT_NOEXCEPT { return m_size; }
+ constexpr bool empty() const CMT_NOEXCEPT { return !m_size; }
constexpr reference operator[](std::size_t index) const { return m_data[index]; }
private:
@@ -126,22 +127,22 @@ inline array_ref<const T> make_array_ref(const std::vector<T>& cont)
}
template <typename C>
-constexpr auto datatype(C& c)
+constexpr auto elementtype(C& c)
{
return c[0];
}
template <typename C>
-constexpr auto datatype(const C& c)
+constexpr auto elementtype(const C& c)
{
return c[0];
}
template <typename E>
-constexpr E datatype(const std::initializer_list<E>& il)
+constexpr E elementtype(const std::initializer_list<E>&)
{
return {};
}
template <typename T, std::size_t N>
-constexpr T datatype(T (&array)[N])
+constexpr T elementtype(T (&)[N])
{
return {};
}
@@ -157,17 +158,17 @@ constexpr auto data(const C& c) -> decltype(c.data())
return c.data();
}
template <typename T, std::size_t N>
-constexpr T* data(T (&array)[N]) noexcept
+constexpr T* data(T (&array)[N]) CMT_NOEXCEPT
{
return array;
}
template <typename T>
-constexpr T* data(T* array) noexcept
+constexpr T* data(T* array) CMT_NOEXCEPT
{
return array;
}
template <typename E>
-constexpr const E* data(const std::initializer_list<E>& il) noexcept
+constexpr const E* data(const std::initializer_list<E>& il) CMT_NOEXCEPT
{
return il.begin();
}
@@ -178,7 +179,7 @@ constexpr auto size(const C& c) -> decltype(c.size())
return c.size();
}
template <typename T, std::size_t N>
-constexpr std::size_t size(const T (&array)[N]) noexcept
+constexpr std::size_t size(const T (&)[N]) CMT_NOEXCEPT
{
return N;
}
diff --git a/include/kfr/cometa/cstring.hpp b/include/kfr/cometa/cstring.hpp
@@ -24,48 +24,48 @@ struct cstring
using value_type = char;
using size_type = size_t;
- constexpr const value_type* c_str() const noexcept { return value; }
- constexpr const value_type* data() const noexcept { return value; }
+ constexpr const value_type* c_str() const CMT_NOEXCEPT { return value; }
+ constexpr const value_type* data() const CMT_NOEXCEPT { return value; }
const value_type value[N];
- constexpr size_type length() const noexcept { return N - 1; }
- constexpr size_type size() const noexcept { return N; }
+ constexpr size_type length() const CMT_NOEXCEPT { return N - 1; }
+ constexpr size_type size() const CMT_NOEXCEPT { return N; }
template <size_t start, size_t count>
- constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const noexcept
+ constexpr cstring<count> slice(csize_t<start>, csize_t<count>) const CMT_NOEXCEPT
{
- return slice_impl(csizeseq_t<count, start>());
+ return slice_impl(csizeseq<count, start>);
}
template <size_t start>
- constexpr cstring<N - start> slice(csize_t<start>) const noexcept
+ constexpr cstring<N - start> slice(csize_t<start>) const CMT_NOEXCEPT
{
- return slice_impl(csizeseq_t<N - 1 - start, start>());
+ return slice_impl(csizeseq<N - 1 - start, start>);
}
- constexpr friend bool operator==(const cstring& left, const cstring& right) noexcept
+ constexpr friend bool operator==(const cstring& left, const cstring& right) CMT_NOEXCEPT
{
for (size_t i = 0; i < 1; i++)
if (left.value[i] != right.value[i])
return false;
return true;
}
- constexpr friend bool operator!=(const cstring& left, const cstring& right) noexcept
+ constexpr friend bool operator!=(const cstring& left, const cstring& right) CMT_NOEXCEPT
{
return !(left == right);
}
template <size_t NN>
- constexpr bool operator==(const cstring<NN>& other) const noexcept
+ constexpr bool operator==(const cstring<NN>&) const CMT_NOEXCEPT
{
return false;
}
template <size_t NN>
- constexpr bool operator!=(const cstring<NN>& other) const noexcept
+ constexpr bool operator!=(const cstring<NN>&) const CMT_NOEXCEPT
{
return true;
}
- constexpr char operator[](size_t index) const noexcept { return value[index]; }
+ constexpr char operator[](size_t index) const CMT_NOEXCEPT { return value[index]; }
private:
template <size_t... indices>
@@ -98,9 +98,9 @@ CMT_INLINE constexpr cstring<N1 - 1 + N2 - 1 + 1> concat_str_impl(const cstring<
return concat_str_impl(str1, str2, cvalseq_t<size_t, N1 - 1 + N2 - 1>());
}
template <size_t N1, size_t Nfrom, size_t Nto, size_t... indices>
-CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str,
- const cstring<Nfrom>&, const cstring<Nto>& to,
- csizes_t<indices...>)
+CMT_INTRINSIC cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<N1>& str,
+ const cstring<Nfrom>&, const cstring<Nto>& to,
+ csizes_t<indices...>)
{
if (pos == size_t(-1))
stop_constexpr();
@@ -111,35 +111,35 @@ CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace_impl(size_t pos, const cstring<
}
} // namespace details
-CMT_INTRIN constexpr cstring<1> concat_cstring() { return { { 0 } }; }
+CMT_INTRINSIC constexpr cstring<1> concat_cstring() { return { { 0 } }; }
template <size_t N1>
-CMT_INTRIN constexpr cstring<N1> concat_cstring(const cstring<N1>& str1)
+CMT_INTRINSIC constexpr cstring<N1> concat_cstring(const cstring<N1>& str1)
{
return str1;
}
template <size_t N1, size_t N2, typename... Args>
-CMT_INTRIN constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2,
- const Args&... args)
+CMT_INTRINSIC constexpr auto concat_cstring(const cstring<N1>& str1, const cstring<N2>& str2,
+ const Args&... args)
{
return details::concat_str_impl(str1, concat_cstring(str2, args...));
}
template <size_t N>
-CMT_INTRIN constexpr cstring<N> make_cstring(const char (&str)[N])
+CMT_INTRINSIC constexpr cstring<N> make_cstring(const char (&str)[N])
{
return details::make_cstring_impl(str, cvalseq_t<size_t, N - 1>());
}
template <char... chars>
-CMT_INTRIN constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>)
+CMT_INTRINSIC constexpr cstring<sizeof...(chars) + 1> make_cstring(cchars_t<chars...>)
{
return { { chars..., 0 } };
}
template <size_t N1, size_t Nneedle>
-CMT_INTRIN size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needle)
+CMT_INTRINSIC size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needle)
{
size_t count = 0;
for (size_t i = 0; i < N1; i++)
@@ -155,8 +155,8 @@ CMT_INTRIN size_t str_find(const cstring<N1>& str, const cstring<Nneedle>& needl
}
template <size_t N1, size_t Nfrom, size_t Nto>
-CMT_INTRIN cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from,
- const cstring<Nto>& to)
+CMT_INTRINSIC cstring<N1 - Nfrom + Nto> str_replace(const cstring<N1>& str, const cstring<Nfrom>& from,
+ const cstring<Nto>& to)
{
return details::str_replace_impl(str_find(str, from), str, from, to,
cvalseq_t<size_t, N1 - Nfrom + Nto - 1>());
diff --git a/include/kfr/cometa/ctti.hpp b/include/kfr/cometa/ctti.hpp
@@ -12,7 +12,7 @@ using pconstvoid = const void*;
struct type_id_t
{
- constexpr type_id_t(const void* id) noexcept : id(id) {}
+ constexpr type_id_t(const void* id) CMT_NOEXCEPT : id(id) {}
constexpr bool operator==(type_id_t other) const { return id == other.id; }
constexpr bool operator!=(type_id_t other) const { return !(id == other.id); }
const void* const id;
@@ -22,7 +22,7 @@ namespace details
{
template <typename T>
-constexpr inline type_id_t typeident_impl() noexcept
+constexpr inline type_id_t typeident_impl() CMT_NOEXCEPT
{
return type_id_t(pconstvoid(&typeident_impl<T>));
}
@@ -30,21 +30,32 @@ constexpr inline type_id_t typeident_impl() noexcept
#ifdef CMT_COMPILER_CLANG
constexpr size_t typename_prefix = sizeof("auto cometa::ctype_name() [T = ") - 1;
constexpr size_t typename_postfix = sizeof("]") - 1;
+#elif CMT_COMPILER_MSVC
+constexpr size_t typename_prefix = sizeof("auto __cdecl cometa::ctype_name<") - 1;
+constexpr size_t typename_postfix = sizeof(">(void) noexcept") - 1;
#else
constexpr size_t typename_prefix = sizeof("constexpr auto cometa::ctype_name() [with T = ") - 1;
constexpr size_t typename_postfix = sizeof("]") - 1;
#endif
template <size_t... indices, size_t Nout = 1 + sizeof...(indices)>
-constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) noexcept
+constexpr cstring<Nout> gettypename_impl(const char* str, csizes_t<indices...>) CMT_NOEXCEPT
{
return cstring<Nout>{ { (str[indices])..., 0 } };
}
} // namespace details
+#ifdef CMT_COMPILER_MSVC
+#define KFR_CALL_CONV_SPEC __cdecl
+#else
+#define KFR_CALL_CONV_SPEC
+#endif
+
template <typename T>
-constexpr auto ctype_name() noexcept
+constexpr auto KFR_CALL_CONV_SPEC ctype_name() CMT_NOEXCEPT
{
+ static_assert(details::typename_prefix + details::typename_postfix + 1 <= sizeof(CMT_FUNC_SIGNATURE) - 1,
+ "Incorrect details::typename_prefix or details::typename_postfix");
return details::gettypename_impl(CMT_FUNC_SIGNATURE + details::typename_prefix,
csizeseq_t<(sizeof(CMT_FUNC_SIGNATURE) - 1 - details::typename_prefix -
details::typename_postfix)>());
@@ -57,7 +68,7 @@ constexpr auto ctype_name() noexcept
* @return name of the type
*/
template <typename T>
-inline const char* type_name() noexcept
+inline const char* type_name() CMT_NOEXCEPT
{
static const auto name = ctype_name<T>();
return name.c_str();
@@ -70,7 +81,7 @@ inline const char* type_name() noexcept
* @return name of the type
*/
template <typename T>
-inline const char* type_name(T x) noexcept
+inline const char* type_name(T x) CMT_NOEXCEPT
{
(void)x;
return type_name<T>();
diff --git a/include/kfr/cometa/function.hpp b/include/kfr/cometa/function.hpp
@@ -16,20 +16,20 @@ struct virtual_function
{
virtual Result operator()(Args... args) = 0;
virtual virtual_function* make_copy() const = 0;
- CMT_INTRIN virtual ~virtual_function() = default;
+ virtual ~virtual_function() = default;
};
template <typename Fn, typename Result, typename... Args>
struct virtual_function_impl : virtual_function<Result, Args...>
{
public:
- CMT_INTRIN virtual_function_impl(const Fn& fn) : fn(fn) {}
- CMT_INTRIN Result operator()(Args... args) override final { return fn(args...); }
- CMT_INTRIN virtual_function<Result, Args...>* make_copy() const override final
+ CMT_MEM_INTRINSIC virtual_function_impl(const Fn& fn) : fn(fn) {}
+ CMT_MEM_INTRINSIC Result operator()(Args... args) final { return fn(args...); }
+ CMT_MEM_INTRINSIC virtual_function<Result, Args...>* make_copy() const final
{
return new virtual_function_impl{ fn };
}
- CMT_INTRIN ~virtual_function_impl() {}
+ CMT_MEM_INTRINSIC ~virtual_function_impl() {}
private:
Fn fn;
@@ -47,13 +47,13 @@ struct func_filter<Result(Args...)>
};
template <typename T>
-constexpr CMT_INTRIN T return_val() noexcept
+constexpr CMT_INTRINSIC T return_val() CMT_NOEXCEPT
{
return {};
}
template <>
-constexpr CMT_INTRIN void return_val<void>() noexcept
+constexpr CMT_INTRINSIC void return_val<void>() CMT_NOEXCEPT
{
}
} // namespace details
@@ -81,16 +81,16 @@ struct function<Result(Args...)>
return *this;
}
- CMT_INTRIN function() : fn(nullptr) {}
- CMT_INTRIN function(std::nullptr_t) : fn(nullptr) {}
+ CMT_MEM_INTRINSIC function() : fn(nullptr) {}
+ CMT_MEM_INTRINSIC function(std::nullptr_t) : fn(nullptr) {}
template <typename Func>
- CMT_INTRIN function(const Func& x)
+ CMT_MEM_INTRINSIC function(const Func& x)
: fn(new details::virtual_function_impl<typename details::func_filter<Func>::type, Result, Args...>(
x))
{
}
function(const this_t& other) : fn(other.fn ? other.fn->make_copy() : nullptr) {}
- CMT_INTRIN function& operator=(const this_t& other)
+ CMT_MEM_INTRINSIC function& operator=(const this_t& other)
{
if ((&other != this) && (other.fn))
{
@@ -100,14 +100,14 @@ struct function<Result(Args...)>
}
return *this;
}
- CMT_INTRIN function& operator=(std::nullptr_t)
+ CMT_MEM_INTRINSIC function& operator=(std::nullptr_t)
{
delete fn;
fn = nullptr;
return *this;
}
template <typename Fn>
- CMT_INTRIN function& operator=(const Fn& x)
+ CMT_MEM_INTRINSIC function& operator=(const Fn& x)
{
using FnImpl =
details::virtual_function_impl<typename details::func_filter<Fn>::type, Result, Args...>;
@@ -116,15 +116,15 @@ struct function<Result(Args...)>
fn = temp;
return *this;
}
- CMT_INTRIN Result operator()(Args... args) const { return (*fn)(std::forward<Args>(args)...); }
+ CMT_MEM_INTRINSIC Result operator()(Args... args) const { return (*fn)(std::forward<Args>(args)...); }
template <typename TResult>
- CMT_INTRIN Result call(TResult&& default_result, Args... args) const
+ CMT_MEM_INTRINSIC Result call(TResult&& default_result, Args... args) const
{
return fn ? (*fn)(std::forward<Args>(args)...) : std::forward<TResult>(default_result);
}
- CMT_INTRIN explicit operator bool() const noexcept { return !!fn; }
+ CMT_MEM_INTRINSIC explicit operator bool() const CMT_NOEXCEPT { return !!fn; }
- CMT_INTRIN ~function() { delete fn; }
+ CMT_MEM_INTRINSIC ~function() { delete fn; }
private:
details::virtual_function<Result, Args...>* fn;
diff --git a/include/kfr/cometa/named_arg.hpp b/include/kfr/cometa/named_arg.hpp
@@ -19,10 +19,10 @@ struct named_arg
struct named
{
- constexpr named(const char* name) noexcept : name(name) {}
+ constexpr named(const char* name) CMT_NOEXCEPT : name(name) {}
template <typename T>
- CMT_INTRIN constexpr named_arg<T> operator=(T&& value)
+ CMT_MEM_INTRINSIC constexpr named_arg<T> operator=(T&& value)
{
return named_arg<T>{ std::forward<T>(value), name };
}
diff --git a/include/kfr/cometa/numeric.hpp b/include/kfr/cometa/numeric.hpp
@@ -0,0 +1,194 @@
+/** @addtogroup cometa
+ * @{
+ */
+#pragma once
+
+#include "../cometa.hpp"
+
+namespace cometa
+{
+
+/// @brief Short names for common types
+using b8 = bool;
+using f32 = float;
+using f64 = double;
+using i8 = int8_t;
+using i16 = int16_t;
+using i32 = int32_t;
+using i64 = int64_t;
+using u8 = uint8_t;
+using u16 = uint16_t;
+using u32 = uint32_t;
+using u64 = uint64_t;
+using umax = uint64_t;
+using imax = int64_t;
+using fmax = double;
+using f80 = long double;
+
+#if defined(CMT_BASETYPE_F32) || defined(CMT_NO_NATIVE_F64)
+using fbase = float;
+#else
+using fbase = double;
+#endif
+
+namespace details
+{
+template <typename T>
+struct fix_type_impl
+{
+ using type = T;
+};
+
+template <>
+struct fix_type_impl<char>
+{
+ using type = i8;
+};
+
+template <>
+struct fix_type_impl<unsigned long>
+{
+#if ULONG_MAX == ULLONG_MAX
+ using type = u64;
+#else
+ using type = u32;
+#endif
+};
+
+template <>
+struct fix_type_impl<signed long>
+{
+#if LONG_MAX == LLONG_MAX
+ using type = i64;
+#else
+ using type = i32;
+#endif
+};
+
+template <>
+struct fix_type_impl<unsigned long long>
+{
+ using type = u64;
+};
+
+template <>
+struct fix_type_impl<signed long long>
+{
+ using type = i64;
+};
+
+} // namespace details
+
+template <typename T>
+using fix_type = typename details::fix_type_impl<T>::type;
+
+/// @brief An enumeration representing data type
+enum class datatype : int
+{
+ typebits_mask = 0xFF,
+ f = 0x100, // floating point
+ i = 0x200, // signed integer
+ u = 0x300, // unsigned integer
+ c = 0x400, // complex floating point
+ b = 0x500, // boolean
+ typeclass_mask = 0xF00,
+ f16 = static_cast<int>(f) | 16,
+ f32 = static_cast<int>(f) | 32,
+ f64 = static_cast<int>(f) | 64,
+ f80 = static_cast<int>(f) | 80,
+ i8 = static_cast<int>(i) | 8,
+ i16 = static_cast<int>(i) | 16,
+ i24 = static_cast<int>(i) | 24,
+ i32 = static_cast<int>(i) | 32,
+ i64 = static_cast<int>(i) | 64,
+ u8 = static_cast<int>(u) | 8,
+ u16 = static_cast<int>(u) | 16,
+ u24 = static_cast<int>(u) | 24,
+ u32 = static_cast<int>(u) | 32,
+ u64 = static_cast<int>(u) | 64,
+ c32 = static_cast<int>(c) | 32,
+ c64 = static_cast<int>(c) | 64,
+ b8 = static_cast<int>(b) | 8
+};
+
+constexpr inline datatype operator|(datatype x, datatype y)
+{
+ using type = underlying_type<datatype>;
+ return static_cast<datatype>(static_cast<type>(x) | static_cast<type>(y));
+}
+
+constexpr inline datatype operator&(datatype x, datatype y)
+{
+ using type = underlying_type<datatype>;
+ return static_cast<datatype>(static_cast<type>(x) & static_cast<type>(y));
+}
+
+template <typename T>
+constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value
+ ? datatype::f
+ : std::is_integral<typename compound_type_traits<T>::subtype>::value
+ ? (std::is_unsigned<typename compound_type_traits<T>::subtype>::value
+ ? datatype::u
+ : datatype::i)
+ : datatype();
+
+template <typename T>
+using is_f_class = std::integral_constant<bool, typeclass<T> == datatype::f>;
+template <typename T>
+using is_u_class = std::integral_constant<bool, typeclass<T> == datatype::u>;
+template <typename T>
+using is_i_class = std::integral_constant<bool, typeclass<T> == datatype::i>;
+
+template <typename T>
+struct typebits
+{
+ static_assert(is_number<deep_subtype<T>>::value, "");
+ constexpr static size_t bits = sizeof(typename compound_type_traits<T>::subtype) * 8;
+ constexpr static size_t width = compound_type_traits<T>::is_scalar ? 0 : compound_type_traits<T>::width;
+ using subtype = typename compound_type_traits<T>::subtype;
+};
+
+template <typename T>
+using ftype =
+ typename compound_type_traits<T>::template deep_rebind<float_type<typebits<deep_subtype<T>>::bits>>;
+template <typename T>
+using itype =
+ typename compound_type_traits<T>::template deep_rebind<int_type<typebits<deep_subtype<T>>::bits>>;
+template <typename T>
+using utype =
+ typename compound_type_traits<T>::template deep_rebind<unsigned_type<typebits<deep_subtype<T>>::bits>>;
+
+template <typename T>
+using uitype = conditional<is_i_class<deep_subtype<T>>::value, T, utype<T>>;
+
+template <typename T>
+using fsubtype = ftype<subtype<T>>;
+template <typename T>
+using isubtype = itype<subtype<T>>;
+template <typename T>
+using usubtype = utype<subtype<T>>;
+namespace details
+{
+template <typename T>
+struct flt_type_impl
+{
+ using type = conditional<sizeof(T) <= 2, float, fbase>;
+};
+
+template <>
+struct flt_type_impl<float>
+{
+ using type = float;
+};
+template <>
+struct flt_type_impl<double>
+{
+ using type = double;
+};
+} // namespace details
+
+template <typename T>
+using flt_type = typename cometa::compound_type_traits<T>::template deep_rebind<
+ typename details::flt_type_impl<deep_subtype<T>>::type>;
+
+} // namespace cometa
diff --git a/include/kfr/cometa/range.hpp b/include/kfr/cometa/range.hpp
@@ -19,8 +19,9 @@ struct range
using const_pointer = const T*;
using diff_type = decltype(std::declval<T>() - std::declval<T>());
- constexpr range(value_type begin, value_type end, diff_type step) noexcept
- : value_begin(begin), value_end(end), step(step)
+ constexpr range(value_type begin, value_type end, diff_type step) CMT_NOEXCEPT : min(begin),
+ max(end),
+ step(step)
{
}
@@ -28,42 +29,44 @@ struct range
{
value_type value;
diff_type step;
- const_reference operator*() const { return value; }
- const_pointer operator->() const { return &value; }
- iterator& operator++()
+ constexpr const_reference operator*() const { return value; }
+ constexpr const_pointer operator->() const { return &value; }
+ constexpr iterator& operator++()
{
value += step;
return *this;
}
- iterator operator++(int)
+ constexpr iterator operator++(int)
{
iterator copy = *this;
++(*this);
return copy;
}
- bool operator!=(const iterator& other) const
+ constexpr bool operator!=(const iterator& other) const
{
return step > 0 ? value < other.value : value > other.value;
}
};
- value_type value_begin;
- value_type value_end;
+ value_type min;
+ value_type max;
diff_type step;
- iterator begin() const { return iterator{ value_begin, step }; }
- iterator end() const { return iterator{ value_end, step }; }
+ constexpr iterator begin() const { return iterator{ min, step }; }
+ constexpr iterator end() const { return iterator{ max, step }; }
+
+ constexpr T distance() const { return max - min; }
};
/// @brief Make iterable range object
template <typename T>
-range<T> make_range(T begin, T end)
+constexpr range<T> make_range(T begin, T end)
{
return range<T>(begin, end, end > begin ? 1 : -1);
}
/// @brief Make iterable range object with step
-template <typename T, typename diff_type = decltype(std::declval<T>() - std::declval<T>())>
-range<T> make_range(T begin, T end, diff_type step)
+template <typename T, typename D>
+constexpr range<std::common_type_t<T, D>> make_range(T begin, T end, D step)
{
- return range<T>(begin, end, step);
+ return range<std::common_type_t<T, D>>(begin, end, step);
}
} // namespace cometa
diff --git a/include/kfr/cometa/result.hpp b/include/kfr/cometa/result.hpp
@@ -20,18 +20,19 @@ struct result
constexpr static error_type ok_value = OkValue;
- constexpr result(const result&) = default;
- constexpr result(result&&) noexcept = default;
+ constexpr result(const result&) = default;
+ constexpr result(result&&) CMT_NOEXCEPT = default;
- constexpr result(ErrEnum error) noexcept : m_error(error) {}
+ constexpr result(ErrEnum error) CMT_NOEXCEPT : m_error(error) {}
template <typename ValueInit, CMT_ENABLE_IF(std::is_constructible<value_type, ValueInit>::value)>
- constexpr result(ValueInit&& value) noexcept : m_value(std::forward<ValueInit>(value)), m_error(OkValue)
+ constexpr result(ValueInit&& value) CMT_NOEXCEPT : m_value(std::forward<ValueInit>(value)),
+ m_error(OkValue)
{
}
- constexpr result(const Type& value) noexcept : m_value(value), m_error(OkValue) {}
- constexpr result(Type&& value) noexcept : m_value(std::move(value)), m_error(OkValue) {}
+ constexpr result(const Type& value) CMT_NOEXCEPT : m_value(value), m_error(OkValue) {}
+ constexpr result(Type&& value) CMT_NOEXCEPT : m_value(std::move(value)), m_error(OkValue) {}
constexpr explicit operator bool() const { return m_error == OkValue; }
constexpr const_reference operator*() const { return m_value; }
diff --git a/include/kfr/cometa/string.hpp b/include/kfr/cometa/string.hpp
@@ -27,7 +27,7 @@ template <typename T>
struct representation
{
using type = T;
- static constexpr const T& get(const T& value) noexcept { return value; }
+ static constexpr const T& get(const T& value) CMT_NOEXCEPT { return value; }
};
template <typename T>
@@ -175,7 +175,7 @@ CMT_INLINE auto pack_value(const fmt_t<T, t, width, prec>& value)
}
template <typename T>
-CMT_INLINE auto pack_value(const T& value)
+CMT_INLINE auto pack_value(const T&)
{
return pack_value(type_name<T>());
}
@@ -218,7 +218,7 @@ CMT_INLINE constexpr cstring<N1 - 3 + Nnew> fmt_replace_impl(const cstring<N1>&
template <size_t N1, size_t Nto>
CMT_INLINE constexpr cstring<N1 - 3 + Nto> fmt_replace(const cstring<N1>& str, const cstring<Nto>& newfmt)
{
- return fmt_replace_impl(str, newfmt, csizeseq_t<N1 - 3 + Nto - 1>());
+ return fmt_replace_impl(str, newfmt, csizeseq<N1 - 3 + Nto - 1>);
}
inline std::string replace_one(const std::string& str, const std::string& from, const std::string& to)
@@ -305,7 +305,7 @@ struct print_t
}
};
-#ifdef CMT_COMPILER_GNU
+#if defined CMT_COMPILER_GNU && !defined(CMT_COMPILER_INTEL)
template <typename Char, Char... chars>
constexpr format_t<chars...> operator""_format()
diff --git a/include/kfr/cpuid.hpp b/include/kfr/cpuid.hpp
@@ -1,26 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "cpuid/cpuid.hpp"
-#include "cpuid/cpuid_auto.hpp"
diff --git a/include/kfr/cpuid/cpuid.hpp b/include/kfr/cpuid/cpuid.hpp
@@ -1,297 +0,0 @@
-/** @addtogroup cpuid
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-
-#include "../base/platform.hpp"
-#include "../base/types.hpp"
-#include <cstring>
-
-namespace kfr
-{
-#ifdef CMT_ARCH_X86
-
-struct cpu_features
-{
- u32 max;
- u32 exmax;
- u32 isIntel : 1;
- u32 isAMD : 1;
- u32 has3DNOW : 1;
- u32 has3DNOWEXT : 1;
- u32 hasABM : 1;
- u32 hasADX : 1;
- u32 hasAES : 1;
- u32 hasAVX : 1;
- u32 hasAVX2 : 1;
- u32 hasAVXOSSUPPORT : 1;
- u32 hasAVX512OSSUPPORT : 1;
- u32 hasAVX512CD : 1;
- u32 hasAVX512ER : 1;
- u32 hasAVX512F : 1;
- u32 hasAVX512DQ : 1;
- u32 hasAVX512PF : 1;
- u32 hasAVX512BW : 1;
- u32 hasAVX512VL : 1;
- u32 hasBMI1 : 1;
- u32 hasBMI2 : 1;
- u32 hasCLFSH : 1;
- u32 hasCMOV : 1;
- u32 hasCMPXCHG16B : 1;
- u32 hasCX8 : 1;
- u32 hasERMS : 1;
- u32 hasF16C : 1;
- u32 hasFMA : 1;
- u32 hasFSGSBASE : 1;
- u32 hasFXSR : 1;
- u32 hasHLE : 1;
- u32 hasINVPCID : 1;
- u32 hasLAHF : 1;
- u32 hasLZCNT : 1;
- u32 hasMMX : 1;
- u32 hasMMXEXT : 1;
- u32 hasMONITOR : 1;
- u32 hasMOVBE : 1;
- u32 hasMSR : 1;
- u32 hasOSXSAVE : 1;
- u32 hasPCLMULQDQ : 1;
- u32 hasPOPCNT : 1;
- u32 hasPREFETCHWT1 : 1;
- u32 hasRDRAND : 1;
- u32 hasRDSEED : 1;
- u32 hasRDTSCP : 1;
- u32 hasRTM : 1;
- u32 hasSEP : 1;
- u32 hasSHA : 1;
- u32 hasSSE : 1;
- u32 hasSSE2 : 1;
- u32 hasSSE3 : 1;
- u32 hasSSE41 : 1;
- u32 hasSSE42 : 1;
- u32 hasSSE4a : 1;
- u32 hasSSSE3 : 1;
- u32 hasSYSCALL : 1;
- u32 hasTBM : 1;
- u32 hasXOP : 1;
- u32 hasXSAVE : 1;
- u32 padding1 : 6;
- char vendor[17];
- char model[49];
- char padding2[2];
-};
-
-namespace internal
-{
-
-struct cpu_data
-{
- u32 data[4];
-};
-
-#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG
-CMT_INLINE u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx)
-{
- __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc));
- return 1;
-}
-CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
-{
- get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]);
-}
-CMT_INLINE u32 get_xcr0()
-{
- u32 xcr0;
- __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
- return xcr0;
-}
-#elif defined CMT_COMPILER_MSVC
-
-CMT_INLINE void cpuid(u32* ptr, u32 func, u32 subfunc = 0) { __cpuidex((int*)ptr, (int)func, (int)subfunc); }
-CMT_INLINE u32 get_xcr0()
-{
-#ifdef _XCR_XFEATURE_ENABLED_MASK
- unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
- return (u32)Result;
-#else
- return 0;
-#endif
-}
-#endif
-
-template <size_t = 0>
-cpu_t detect_cpu()
-{
- cpu_features c;
- memset(&c, 0, sizeof(c));
- cpu_data data0;
- cpu_data exdata0;
-
- u32 f_1_ECX(0);
- u32 f_1_EDX(0);
- u32 f_7_EBX(0);
- u32 f_7_ECX(0);
- u32 f_81_ECX(0);
- u32 f_81_EDX(0);
-
- cpuid(data0.data, 0);
- c.max = static_cast<u32>(data0.data[0]);
- cpuid(exdata0.data, 0x80000000);
- c.exmax = static_cast<u32>(exdata0.data[0]);
-
- *ptr_cast<u32>(c.vendor) = static_cast<u32>(data0.data[1]);
- *ptr_cast<u32>(c.vendor + 4) = static_cast<u32>(data0.data[3]);
- *ptr_cast<u32>(c.vendor + 8) = static_cast<u32>(data0.data[2]);
-
- c.isIntel = strncmp(c.vendor, "GenuineIntel", sizeof(c.vendor)) == 0 ? 1 : 0;
- c.isAMD = strncmp(c.vendor, "AuthenticAMD", sizeof(c.vendor)) == 0 ? 1 : 0;
-
- if (c.max >= 1)
- {
- cpu_data data1;
- cpuid(data1.data, 1);
- f_1_ECX = static_cast<u32>(data1.data[2]);
- f_1_EDX = static_cast<u32>(data1.data[3]);
- }
-
- if (c.max >= 7)
- {
- cpu_data data7;
- cpuid(data7.data, 7);
- f_7_EBX = static_cast<u32>(data7.data[1]);
- f_7_ECX = static_cast<u32>(data7.data[2]);
- }
-
- if (c.exmax >= 0x80000001)
- {
- cpu_data data81;
- cpuid(data81.data, 0x80000001);
- f_81_ECX = static_cast<u32>(data81.data[2]);
- f_81_EDX = static_cast<u32>(data81.data[3]);
- }
-
- if (c.exmax >= 0x80000004)
- {
- cpu_data data82;
- cpu_data data83;
- cpu_data data84;
- cpuid(data82.data, 0x80000002);
- cpuid(data83.data, 0x80000003);
- cpuid(data84.data, 0x80000004);
- memcpy(c.model, data82.data, sizeof(cpu_data));
- memcpy(c.model + 16, data83.data, sizeof(cpu_data));
- memcpy(c.model + 32, data84.data, sizeof(cpu_data));
- }
-
- c.hasSSE3 = f_1_ECX >> 0 & 1;
- c.hasPCLMULQDQ = f_1_ECX >> 1 & 1;
- c.hasMONITOR = f_1_ECX >> 3 & 1;
- c.hasSSSE3 = f_1_ECX >> 9 & 1;
- c.hasFMA = f_1_ECX >> 12 & 1;
- c.hasCMPXCHG16B = f_1_ECX >> 13 & 1;
- c.hasSSE41 = f_1_ECX >> 19 & 1;
- c.hasSSE42 = f_1_ECX >> 20 & 1;
- c.hasMOVBE = f_1_ECX >> 22 & 1;
- c.hasPOPCNT = f_1_ECX >> 23 & 1;
- c.hasAES = f_1_ECX >> 25 & 1;
- c.hasXSAVE = f_1_ECX >> 26 & 1;
- c.hasOSXSAVE = f_1_ECX >> 27 & 1;
- c.hasAVX = f_1_ECX >> 28 & 1;
- c.hasF16C = f_1_ECX >> 29 & 1;
- c.hasRDRAND = f_1_ECX >> 30 & 1;
- c.hasMSR = f_1_EDX >> 5 & 1;
- c.hasCX8 = f_1_EDX >> 8 & 1;
- c.hasSEP = f_1_EDX >> 11 & 1;
- c.hasCMOV = f_1_EDX >> 15 & 1;
- c.hasCLFSH = f_1_EDX >> 19 & 1;
- c.hasMMX = f_1_EDX >> 23 & 1;
- c.hasFXSR = f_1_EDX >> 24 & 1;
- c.hasSSE = f_1_EDX >> 25 & 1;
- c.hasSSE2 = f_1_EDX >> 26 & 1;
- c.hasFSGSBASE = f_7_EBX >> 0 & 1;
- c.hasBMI1 = f_7_EBX >> 3 & 1;
- c.hasHLE = c.isIntel && f_7_EBX >> 4 & 1;
- c.hasAVX2 = f_7_EBX >> 5 & 1;
- c.hasBMI2 = f_7_EBX >> 8 & 1;
- c.hasERMS = f_7_EBX >> 9 & 1;
- c.hasINVPCID = f_7_EBX >> 10 & 1;
- c.hasRTM = c.isIntel && f_7_EBX >> 11 & 1;
- c.hasAVX512F = f_7_EBX >> 16 & 1;
- c.hasAVX512DQ = f_7_EBX >> 17 & 1;
- c.hasRDSEED = f_7_EBX >> 18 & 1;
- c.hasADX = f_7_EBX >> 19 & 1;
- c.hasAVX512PF = f_7_EBX >> 26 & 1;
- c.hasAVX512ER = f_7_EBX >> 27 & 1;
- c.hasAVX512CD = f_7_EBX >> 28 & 1;
- c.hasSHA = f_7_EBX >> 29 & 1;
- c.hasAVX512BW = f_7_EBX >> 30 & 1;
- c.hasAVX512VL = f_7_EBX >> 31 & 1;
- c.hasPREFETCHWT1 = f_7_ECX >> 0 & 1;
- c.hasLAHF = f_81_ECX >> 0 & 1;
- c.hasLZCNT = c.isIntel && f_81_ECX >> 5 & 1;
- c.hasABM = c.isAMD && f_81_ECX >> 5 & 1;
- c.hasSSE4a = c.isAMD && f_81_ECX >> 6 & 1;
- c.hasXOP = c.isAMD && f_81_ECX >> 11 & 1;
- c.hasTBM = c.isAMD && f_81_ECX >> 21 & 1;
- c.hasSYSCALL = c.isIntel && f_81_EDX >> 11 & 1;
- c.hasMMXEXT = c.isAMD && f_81_EDX >> 22 & 1;
- c.hasRDTSCP = c.isIntel && f_81_EDX >> 27 & 1;
- c.has3DNOWEXT = c.isAMD && f_81_EDX >> 30 & 1;
- c.has3DNOW = c.isAMD && f_81_EDX >> 31 & 1;
-
- c.hasAVXOSSUPPORT = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06;
- c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0;
-
- if (c.hasAVX512F && c.hasAVX512CD && c.hasAVX512VL && c.hasAVX512BW && c.hasAVX512DQ &&
- c.hasAVX512OSSUPPORT)
- return cpu_t::avx512;
- if (c.hasAVX2 && c.hasAVXOSSUPPORT)
- return cpu_t::avx2;
- if (c.hasAVX && c.hasAVXOSSUPPORT)
- return cpu_t::avx1;
- if (c.hasSSE41)
- return cpu_t::sse41;
- if (c.hasSSSE3)
- return cpu_t::ssse3;
- if (c.hasSSE3)
- return cpu_t::sse3;
- if (c.hasSSE2)
- return cpu_t::sse2;
- return cpu_t::lowest;
-}
-} // namespace internal
-#else
-
-template <size_t = 0>
-cpu_t detect_cpu()
-{
- return cpu_t::native;
-}
-
-#endif
-} // namespace kfr
diff --git a/include/kfr/cpuid/cpuid_auto.hpp b/include/kfr/cpuid/cpuid_auto.hpp
@@ -1,60 +0,0 @@
-/** @addtogroup cpuid
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "cpuid.hpp"
-
-namespace kfr
-{
-namespace internal
-{
-
-CMT_INLINE cpu_t& cpu_v()
-{
- static cpu_t v1 = cpu_t::native;
- return v1;
-}
-
-CMT_INLINE char init_cpu_v()
-{
- cpu_v() = detect_cpu<0>();
- return 0;
-}
-
-CMT_INLINE char init_dummyvar()
-{
- static char dummy = init_cpu_v();
- return dummy;
-}
-
-static char dummyvar = init_dummyvar();
-} // namespace internal
-
-/**
- * @brief Returns cpu instruction set detected at runtime.
- */
-CMT_INLINE cpu_t get_cpu() { return internal::cpu_v(); }
-} // namespace kfr
diff --git a/include/kfr/data/sincos.hpp b/include/kfr/data/sincos.hpp
@@ -1,192 +0,0 @@
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../base/kfr.h"
-#include "../base/types.hpp"
-#include <cstdint>
-
-namespace kfr
-{
-
-namespace data
-{
-
-template <typename T>
-constexpr T c_sin_table[65] = {
- /* sin(2*pi* 0/ 256) */ f32(0.0),
- /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547),
- /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475),
- /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133),
- /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114),
- /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752),
- /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062),
- /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663),
- /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277),
- /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484),
- /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321),
- /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421),
- /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915),
- /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934),
- /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778),
- /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023),
- /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613),
- /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119),
- /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046),
- /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832),
- /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575),
- /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397),
- /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608),
- /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603),
- /* sin(2*pi* 24/ 256) */ f32(0.5555702330196022247428308139485328743749),
- /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776),
- /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119),
- /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594),
- /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757),
- /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769),
- /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288),
- /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455),
- /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848),
- /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831),
- /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729),
- /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404),
- /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971),
- /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565),
- /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796),
- /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471),
- /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386),
- /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198),
- /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425),
- /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108),
- /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084),
- /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006),
- /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101),
- /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447),
- /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224),
- /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295),
- /* sin(2*pi* 50/ 256) */ f32(0.9415440651830207784125094025995023571856),
- /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522),
- /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828),
- /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631),
- /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569),
- /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644),
- /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739),
- /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601),
- /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837),
- /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108),
- /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755),
- /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717),
- /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432),
- /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501),
- /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000)
-};
-
-// data generated by mpfr
-template <>
-constexpr f64 c_sin_table<f64>[65] = {
- /* sin(2*pi* 0/ 256) */ f64(0.0),
- /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547),
- /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475),
- /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133),
- /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114),
- /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752),
- /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062),
- /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663),
- /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277),
- /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484),
- /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321),
- /* sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421),
- /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915),
- /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934),
- /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778),
- /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023),
- /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613),
- /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119),
- /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046),
- /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832),
- /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575),
- /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397),
- /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608),
- /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603),
- /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749),
- /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776),
- /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119),
- /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594),
- /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757),
- /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769),
- /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288),
- /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455),
- /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848),
- /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831),
- /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729),
- /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404),
- /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971),
- /* sin(2*pi* 37/ 256) */ f64(0.7883464276266062620091647053596892826565),
- /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796),
- /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471),
- /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386),
- /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198),
- /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425),
- /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108),
- /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084),
- /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006),
- /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101),
- /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447),
- /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224),
- /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295),
- /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856),
- /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522),
- /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828),
- /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631),
- /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569),
- /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644),
- /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739),
- /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601),
- /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837),
- /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108),
- /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755),
- /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717),
- /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432),
- /* sin(2*pi* 63/ 256) */ f64(0.9996988186962042201157656496661721968501),
- /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000)
-};
-
-} // namespace data
-
-template <typename T>
-constexpr inline T sin_using_table_256(size_t k)
-{
- return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128];
-}
-
-template <typename T>
-constexpr inline T sin_using_table(size_t size, size_t k)
-{
- return sin_using_table_256<T>((k * 256 / size) % 256);
-}
-template <typename T>
-constexpr inline T cos_using_table(size_t size, size_t k)
-{
- return sin_using_table<T>(size, k + size / 4);
-}
-} // namespace kfr
diff --git a/include/kfr/dft/cache.hpp b/include/kfr/dft/cache.hpp
@@ -32,6 +32,8 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
template <typename T>
using dft_plan_ptr = std::shared_ptr<const dft_plan<T>>;
@@ -166,4 +168,5 @@ univector<T> irealdft(const univector<complex<T>, Tag>& input)
dft->execute(output, input, temp);
return output;
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dft
+/** @addtogroup convolution
* @{
*/
/*
@@ -25,12 +25,12 @@
*/
#pragma once
-#include "../base/complex.hpp"
-#include "../base/constants.hpp"
#include "../base/filter.hpp"
#include "../base/memory.hpp"
-#include "../base/read_write.hpp"
-#include "../base/vec.hpp"
+#include "../simd/complex.hpp"
+#include "../simd/constants.hpp"
+#include "../simd/read_write.hpp"
+#include "../simd/vec.hpp"
#include "cache.hpp"
#include "fft.hpp"
@@ -42,8 +42,10 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
-namespace internal
+namespace intrinsics
{
template <typename T>
univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2);
@@ -51,27 +53,27 @@ template <typename T>
univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2);
template <typename T>
univector<T> autocorrelate(const univector_ref<const T>& src1);
-} // namespace internal
+} // namespace intrinsics
/// @brief Convolution
template <typename T, univector_tag Tag1, univector_tag Tag2>
univector<T> convolve(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
{
- return internal::convolve(src1.slice(), src2.slice());
+ return intrinsics::convolve(src1.slice(), src2.slice());
}
/// @brief Correlation
template <typename T, univector_tag Tag1, univector_tag Tag2>
univector<T> correlate(const univector<T, Tag1>& src1, const univector<T, Tag2>& src2)
{
- return internal::correlate(src1.slice(), src2.slice());
+ return intrinsics::correlate(src1.slice(), src2.slice());
}
/// @brief Auto-correlation
template <typename T, univector_tag Tag1>
univector<T> autocorrelate(const univector<T, Tag1>& src)
{
- return internal::autocorrelate(src.slice());
+ return intrinsics::autocorrelate(src.slice());
}
/// @brief Convolution using Filter API
@@ -91,12 +93,12 @@ protected:
}
void process_buffer(T* output, const T* input, size_t size) final;
+ const size_t size;
+ const size_t block_size;
const dft_plan_real<T> fft;
univector<u8> temp;
std::vector<univector<complex<T>>> segments;
std::vector<univector<complex<T>>> ir_segments;
- const size_t size;
- const size_t block_size;
size_t input_position;
univector<T> saved_input;
univector<complex<T>> premul;
@@ -105,6 +107,6 @@ protected:
univector<T> overlap;
size_t position;
};
-
+} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/data/bitrev.hpp b/include/kfr/dft/data/bitrev.hpp
diff --git a/include/kfr/dft/data/sincos.hpp b/include/kfr/dft/data/sincos.hpp
@@ -0,0 +1,192 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../kfr.h"
+#include "../../simd/types.hpp"
+#include <cstdint>
+
+namespace kfr
+{
+
+namespace data
+{
+
+template <typename T>
+constexpr T c_sin_table[65] = {
+ /* sin(2*pi* 0/ 256) */ f32(0.0),
+ /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547),
+ /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475),
+ /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133),
+ /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114),
+ /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752),
+ /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062),
+ /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663),
+ /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277),
+ /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484),
+ /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321),
+ /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421),
+ /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915),
+ /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934),
+ /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778),
+ /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023),
+ /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613),
+ /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119),
+ /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046),
+ /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832),
+ /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575),
+ /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397),
+ /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608),
+ /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603),
+ /* sin(2*pi* 24/ 256) */ f32(0.5555702330196022247428308139485328743749),
+ /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776),
+ /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119),
+ /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594),
+ /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757),
+ /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769),
+ /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288),
+ /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455),
+ /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848),
+ /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831),
+ /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729),
+ /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404),
+ /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971),
+ /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565),
+ /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796),
+ /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471),
+ /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386),
+ /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198),
+ /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425),
+ /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108),
+ /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084),
+ /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006),
+ /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101),
+ /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447),
+ /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224),
+ /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295),
+ /* sin(2*pi* 50/ 256) */ f32(0.9415440651830207784125094025995023571856),
+ /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522),
+ /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828),
+ /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631),
+ /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569),
+ /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644),
+ /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739),
+ /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601),
+ /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837),
+ /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108),
+ /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755),
+ /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717),
+ /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432),
+ /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501),
+ /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000)
+};
+
+// data generated by mpfr
+template <>
+constexpr f64 c_sin_table<f64>[65] = {
+ /* sin(2*pi* 0/ 256) */ f64(0.0),
+ /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547),
+ /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475),
+ /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133),
+ /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114),
+ /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752),
+ /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062),
+ /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663),
+ /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277),
+ /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484),
+ /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321),
+ /* sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421),
+ /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915),
+ /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934),
+ /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778),
+ /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023),
+ /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613),
+ /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119),
+ /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046),
+ /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832),
+ /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575),
+ /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397),
+ /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608),
+ /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603),
+ /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749),
+ /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776),
+ /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119),
+ /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594),
+ /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757),
+ /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769),
+ /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288),
+ /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455),
+ /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848),
+ /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831),
+ /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729),
+ /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404),
+ /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971),
+ /* sin(2*pi* 37/ 256) */ f64(0.7883464276266062620091647053596892826565),
+ /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796),
+ /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471),
+ /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386),
+ /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198),
+ /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425),
+ /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108),
+ /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084),
+ /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006),
+ /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101),
+ /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447),
+ /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224),
+ /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295),
+ /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856),
+ /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522),
+ /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828),
+ /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631),
+ /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569),
+ /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644),
+ /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739),
+ /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601),
+ /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837),
+ /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108),
+ /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755),
+ /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717),
+ /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432),
+ /* sin(2*pi* 63/ 256) */ f64(0.9996988186962042201157656496661721968501),
+ /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000)
+};
+
+} // namespace data
+
+template <typename T>
+constexpr inline T sin_using_table_256(size_t k)
+{
+ return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128];
+}
+
+template <typename T>
+constexpr inline T sin_using_table(size_t size, size_t k)
+{
+ return sin_using_table_256<T>((k * 256 / size) % 256);
+}
+template <typename T>
+constexpr inline T cos_using_table(size_t size, size_t k)
+{
+ return sin_using_table<T>(size, k + size / 4);
+}
+} // namespace kfr
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -25,13 +25,13 @@
*/
#pragma once
-#include "../base/complex.hpp"
-#include "../base/constants.hpp"
#include "../base/memory.hpp"
-#include "../base/read_write.hpp"
#include "../base/small_buffer.hpp"
#include "../base/univector.hpp"
-#include "../base/vec.hpp"
+#include "../simd/complex.hpp"
+#include "../simd/constants.hpp"
+#include "../simd/read_write.hpp"
+#include "../simd/vec.hpp"
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
@@ -57,9 +57,12 @@ enum class dft_type
enum class dft_order
{
normal,
- internal, // possibly bit/digit-reversed, implementation-defined, faster
+ internal, // possibly bit/digit-reversed, implementation-defined, faster to compute
};
+inline namespace CMT_ARCH_NAME
+{
+
template <typename T>
struct dft_stage;
@@ -76,7 +79,8 @@ struct dft_plan
void dump() const;
- KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const
+ KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp,
+ bool inverse = false) const
{
if (inverse)
execute_dft(ctrue, out, in, temp);
@@ -85,14 +89,15 @@ struct dft_plan
}
~dft_plan();
template <bool inverse>
- KFR_INTRIN void execute(complex<T>* out, const complex<T>* in, u8* temp, cbool_t<inverse> inv) const
+ KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp,
+ cbool_t<inverse> inv) const
{
execute_dft(inv, out, in, temp);
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
- KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in,
- univector<u8, Tag3>& temp, bool inverse = false) const
+ KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in,
+ univector<u8, Tag3>& temp, bool inverse = false) const
{
if (inverse)
execute_dft(ctrue, out.data(), in.data(), temp.data());
@@ -100,8 +105,8 @@ struct dft_plan
execute_dft(cfalse, out.data(), in.data(), temp.data());
}
template <bool inverse, univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
- KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in,
- univector<u8, Tag3>& temp, cbool_t<inverse> inv) const
+ KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<complex<T>, Tag2>& in,
+ univector<u8, Tag3>& temp, cbool_t<inverse> inv) const
{
execute_dft(inv, out.data(), in.data(), temp.data());
}
@@ -128,6 +133,9 @@ protected:
const complex<T>* select_in(size_t stage, const complex<T>* out, const complex<T>* in,
const complex<T>* scratch, bool in_scratch) const;
complex<T>* select_out(size_t stage, complex<T>* out, complex<T>* scratch) const;
+
+ void init_dft(size_t size, dft_order order);
+ void init_fft(size_t size, dft_order order);
};
enum class dft_pack_format
@@ -155,14 +163,14 @@ struct dft_plan_real : dft_plan<T>
void execute(univector<complex<T>, Tag1>&, const univector<complex<T>, Tag2>&, univector<u8, Tag3>&,
cbool_t<inverse>) const = delete;
- KFR_INTRIN void execute(complex<T>* out, const T* in, u8* temp,
- dft_pack_format fmt = dft_pack_format::CCs) const
+ KFR_MEM_INTRINSIC void execute(complex<T>* out, const T* in, u8* temp,
+ dft_pack_format fmt = dft_pack_format::CCs) const
{
this->execute_dft(cfalse, out, ptr_cast<complex<T>>(in), temp);
to_fmt(out, fmt);
}
- KFR_INTRIN void execute(T* out, const complex<T>* in, u8* temp,
- dft_pack_format fmt = dft_pack_format::CCs) const
+ KFR_MEM_INTRINSIC void execute(T* out, const complex<T>* in, u8* temp,
+ dft_pack_format fmt = dft_pack_format::CCs) const
{
complex<T>* outdata = ptr_cast<complex<T>>(out);
from_fmt(outdata, in, fmt);
@@ -170,15 +178,17 @@ struct dft_plan_real : dft_plan<T>
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
- KFR_INTRIN void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in,
- univector<u8, Tag3>& temp, dft_pack_format fmt = dft_pack_format::CCs) const
+ KFR_MEM_INTRINSIC void execute(univector<complex<T>, Tag1>& out, const univector<T, Tag2>& in,
+ univector<u8, Tag3>& temp,
+ dft_pack_format fmt = dft_pack_format::CCs) const
{
this->execute_dft(cfalse, out.data(), ptr_cast<complex<T>>(in.data()), temp.data());
to_fmt(out.data(), fmt);
}
template <univector_tag Tag1, univector_tag Tag2, univector_tag Tag3>
- KFR_INTRIN void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in,
- univector<u8, Tag3>& temp, dft_pack_format fmt = dft_pack_format::CCs) const
+ KFR_MEM_INTRINSIC void execute(univector<T, Tag1>& out, const univector<complex<T>, Tag2>& in,
+ univector<u8, Tag3>& temp,
+ dft_pack_format fmt = dft_pack_format::CCs) const
{
complex<T>* outdata = ptr_cast<complex<T>>(out.data());
from_fmt(outdata, in.data(), fmt);
@@ -230,6 +240,7 @@ void fft_multiply_accumulate(univector<complex<T>, Tag1>& dest, const univector<
if (fmt == dft_pack_format::Perm)
dest[0] = f0;
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp
@@ -25,19 +25,21 @@
*/
#pragma once
-#include "../../base/complex.hpp"
-#include "../../base/constants.hpp"
-#include "../../base/digitreverse.hpp"
-#include "../../base/vec.hpp"
+#include "../../simd/complex.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/digitreverse.hpp"
+#include "../../simd/vec.hpp"
-#include "../../data/bitrev.hpp"
+#include "../data/bitrev.hpp"
#include "ft.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
-namespace internal
+namespace intrinsics
{
constexpr bool fft_reorder_aligned = false;
@@ -74,7 +76,7 @@ CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits)
}
template <size_t log2n, size_t bitrev, typename T>
-KFR_INTRIN void fft_reorder_swap(T* inout, size_t i)
+KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i)
{
using cxx = cvec<T, 16>;
constexpr size_t N = 1 << log2n;
@@ -86,7 +88,7 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i)
}
template <size_t log2n, size_t bitrev, typename T>
-KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j)
+KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j)
{
CMT_ASSUME(i != j);
using cxx = cvec<T, 16>;
@@ -103,7 +105,7 @@ KFR_INTRIN void fft_reorder_swap_two(T* inout, size_t i, size_t j)
}
template <size_t log2n, size_t bitrev, typename T>
-KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j)
+KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j)
{
CMT_ASSUME(i != j);
using cxx = cvec<T, 16>;
@@ -120,25 +122,25 @@ KFR_INTRIN void fft_reorder_swap(T* inout, size_t i, size_t j)
}
template <size_t log2n, size_t bitrev, typename T>
-KFR_INTRIN void fft_reorder_swap(complex<T>* inout, size_t i)
+KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i)
{
fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2);
}
template <size_t log2n, size_t bitrev, typename T>
-KFR_INTRIN void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1)
+KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1)
{
fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2);
}
template <size_t log2n, size_t bitrev, typename T>
-KFR_INTRIN void fft_reorder_swap(complex<T>* inout, size_t i, size_t j)
+KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j)
{
fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2);
}
template <typename T>
-KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<11>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>)
{
fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4);
fft_reorder_swap<11>(inout, 1 * 4, 64 * 4);
@@ -207,7 +209,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<11>)
}
template <typename T>
-KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<7>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>)
{
constexpr size_t bitrev = 2;
fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4);
@@ -217,7 +219,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<7>)
}
template <typename T>
-KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<8>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>)
{
constexpr size_t bitrev = 4;
fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4);
@@ -231,7 +233,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<8>)
}
template <typename T>
-KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>)
{
constexpr size_t bitrev = 2;
fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4);
@@ -253,14 +255,14 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, csize_t<9>)
}
template <typename T, bool use_br2>
-void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
+KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>)
{
cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4,
digitreverse<(use_br2 ? 2 : 4), 2>(value));
}
template <typename T, bool use_br2>
-KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>)
+KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>)
{
CMT_ASSUME(i != j);
const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4);
@@ -270,7 +272,7 @@ KFR_INTRIN void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbo
}
template <typename T>
-KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
{
const size_t N = 1 << log2n;
const size_t N4 = N / 4;
@@ -305,7 +307,7 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2)
}
template <typename T>
-KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
+KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
{
const size_t N = size_t(1) << log2n;
const size_t N4 = N / 4;
@@ -386,5 +388,6 @@ KFR_INTRIN void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2)
i += istep;
}
}
-} // namespace internal
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp
@@ -27,8 +27,10 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
-namespace internal
+namespace intrinsics
{
template <typename T>
@@ -76,18 +78,19 @@ univector<T> autocorrelate(const univector_ref<const T>& src1)
return result;
}
-} // namespace internal
+} // namespace intrinsics
template <typename T>
convolve_filter<T>::convolve_filter(size_t size, size_t block_size)
- : fft(2 * next_poweroftwo(block_size)), size(size), block_size(block_size), temp(fft.temp_size),
+ : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size)), temp(fft.temp_size),
segments((size + block_size - 1) / block_size)
+
{
}
template <typename T>
convolve_filter<T>::convolve_filter(const univector<T>& data, size_t block_size)
- : fft(2 * next_poweroftwo(block_size)), size(data.size()), block_size(next_poweroftwo(block_size)),
+ : size(data.size()), block_size(next_poweroftwo(block_size)), fft(2 * next_poweroftwo(block_size)),
temp(fft.temp_size),
segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
@@ -124,8 +127,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
while (processed < size)
{
const size_t processing = std::min(size - processed, block_size - input_position);
- internal::builtin_memcpy(saved_input.data() + input_position, input + processed,
- processing * sizeof(T));
+ builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T));
process(scratch, padded(saved_input));
fft.execute(segments[position], scratch, temp, dft_pack_format::Perm);
@@ -152,7 +154,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
input_position = 0;
process(saved_input, zeros());
- internal::builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
+ builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
position = position > 0 ? position - 1 : segments.size() - 1;
}
@@ -161,7 +163,7 @@ void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
}
}
-namespace internal
+namespace intrinsics
{
template univector<float> convolve<float>(const univector_ref<const float>&,
@@ -171,7 +173,7 @@ template univector<float> correlate<float>(const univector_ref<const float>&,
template univector<float> autocorrelate<float>(const univector_ref<const float>&);
-} // namespace internal
+} // namespace intrinsics
template convolve_filter<float>::convolve_filter(size_t, size_t);
@@ -181,7 +183,7 @@ template void convolve_filter<float>::set_data(const univector<float>&);
template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size);
-namespace internal
+namespace intrinsics
{
template univector<double> convolve<double>(const univector_ref<const double>&,
@@ -191,7 +193,7 @@ template univector<double> correlate<double>(const univector_ref<const double>&,
template univector<double> autocorrelate<double>(const univector_ref<const double>&);
-} // namespace internal
+} // namespace intrinsics
template convolve_filter<double>::convolve_filter(size_t, size_t);
@@ -200,5 +202,5 @@ template convolve_filter<double>::convolve_filter(const univector<double>&, size
template void convolve_filter<double>::set_data(const univector<double>&);
template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size);
-
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp
@@ -0,0 +1,123 @@
+/** @addtogroup dft
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../dft_c.h"
+
+#include "../../base/basic_expressions.hpp"
+#include "../../math/complex_math.hpp"
+#include "../../testo/assert.hpp"
+#include "../cache.hpp"
+#include "../fft.hpp"
+#include "bitrev.hpp"
+#include "ft.hpp"
+
+namespace kfr
+{
+
+inline namespace CMT_ARCH_NAME
+{
+
+#define DFT_ASSERT TESTO_ASSERT_INACTIVE
+
+template <typename T>
+constexpr size_t fft_vector_width = vector_width<T>;
+
+using cdirect_t = cfalse_t;
+using cinvert_t = ctrue_t;
+
+template <typename T>
+struct dft_stage
+{
+ size_t radix = 0;
+ size_t stage_size = 0;
+ size_t data_size = 0;
+ size_t temp_size = 0;
+ u8* data = nullptr;
+ size_t repeats = 1;
+ size_t out_offset = 0;
+ size_t blocks = 0;
+ const char* name = nullptr;
+ bool recursion = false;
+ bool can_inplace = true;
+ bool inplace = false;
+ bool to_scratch = false;
+ bool need_reorder = true;
+
+ void initialize(size_t size) { do_initialize(size); }
+
+ virtual void dump() const
+ {
+ printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
+ stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
+ to_scratch);
+ }
+
+ KFR_MEM_INTRINSIC void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ do_execute(cdirect_t(), out, in, temp);
+ }
+ KFR_MEM_INTRINSIC void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ do_execute(cinvert_t(), out, in, temp);
+ }
+ virtual ~dft_stage() {}
+
+protected:
+ virtual void do_initialize(size_t) {}
+ virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0;
+ virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0;
+};
+
+#define DFT_STAGE_FN \
+ void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \
+ { \
+ return do_execute<false>(out, in, temp); \
+ } \
+ void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \
+ { \
+ return do_execute<true>(out, in, temp); \
+ }
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+#if CMT_HAS_WARNING("-Wassume")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
+#endif
+
+template <typename T>
+template <typename Stage, typename... Args>
+void dft_plan<T>::add_stage(Args... args)
+{
+ dft_stage<T>* stage = new Stage(args...);
+ stage->need_reorder = need_reorder;
+ this->data_size += stage->data_size;
+ this->temp_size += stage->temp_size;
+ stages.push_back(dft_stage_ptr(stage));
+}
+
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp
@@ -23,20 +23,17 @@
disclosing the source code of your own applications.
See https://www.kfrlib.com for details.
*/
+#pragma once
-#include "../dft_c.h"
-
-#include "../../base/basic_expressions.hpp"
-#include "../../testo/assert.hpp"
-#include "../cache.hpp"
-#include "../fft.hpp"
-#include "bitrev.hpp"
-#include "ft.hpp"
+#include "dft-fft.hpp"
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
#endif
+#if CMT_HAS_WARNING("-Wunused-lambda-capture")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture")
+#endif
CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4100))
@@ -44,439 +41,15 @@ CMT_PRAGMA_MSVC(warning(disable : 4100))
namespace kfr
{
-constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{};
-
-#define DFT_ASSERT TESTO_ASSERT_INACTIVE
-
-template <typename T>
-constexpr size_t fft_vector_width = platform<T>::vector_width;
-
-using cdirect_t = cfalse_t;
-using cinvert_t = ctrue_t;
-
-template <typename T>
-struct dft_stage
-{
- size_t radix = 0;
- size_t stage_size = 0;
- size_t data_size = 0;
- size_t temp_size = 0;
- u8* data = nullptr;
- size_t repeats = 1;
- size_t out_offset = 0;
- size_t blocks = 0;
- const char* name = nullptr;
- bool recursion = false;
- bool can_inplace = true;
- bool inplace = false;
- bool to_scratch = false;
- bool need_reorder = true;
-
- void initialize(size_t size) { do_initialize(size); }
-
- virtual void dump() const
- {
- printf("%s: \n\t%5zu,%5zu,%5zu,%5zu,%5zu,%5zu,%5zu, %d, %d, %d, %d\n", name ? name : "unnamed", radix,
- stage_size, data_size, temp_size, repeats, out_offset, blocks, recursion, can_inplace, inplace,
- to_scratch);
- }
-
- KFR_INTRIN void execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp)
- {
- do_execute(cdirect_t(), out, in, temp);
- }
- KFR_INTRIN void execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp)
- {
- do_execute(cinvert_t(), out, in, temp);
- }
- virtual ~dft_stage() {}
-
-protected:
- virtual void do_initialize(size_t) {}
- virtual void do_execute(cdirect_t, complex<T>*, const complex<T>*, u8* temp) = 0;
- virtual void do_execute(cinvert_t, complex<T>*, const complex<T>*, u8* temp) = 0;
-};
-
-#define DFT_STAGE_FN \
- void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \
- { \
- return do_execute<false>(out, in, temp); \
- } \
- void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \
- { \
- return do_execute<true>(out, in, temp); \
- }
-
-CMT_PRAGMA_GNU(GCC diagnostic push)
-#if CMT_HAS_WARNING("-Wassume")
-CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume")
-#endif
-
-namespace internal
-{
-
-template <size_t width, bool inverse, typename T>
-KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>,
- const cvec<T, width>& w, const cvec<T, width>& tw)
-{
- cvec<T, width> ww = w;
- cvec<T, width> tw_ = tw;
- cvec<T, width> b1 = ww * dupeven(tw_);
- ww = swap<2>(ww);
-
- if (inverse)
- tw_ = -(tw_);
- ww = subadd(b1, ww * dupodd(tw_));
- return ww;
-}
-
-template <size_t width, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>,
- cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in,
- const complex<T>* twiddle)
-{
- const size_t N4 = N / 4;
- cvec<T, width> w1, w2, w3;
-
- cvec<T, width> sum02, sum13, diff02, diff13;
-
- cvec<T, width> a0, a1, a2, a3;
- a0 = cread<width, aligned>(in + 0);
- a2 = cread<width, aligned>(in + N4 * 2);
- sum02 = a0 + a2;
-
- a1 = cread<width, aligned>(in + N4);
- a3 = cread<width, aligned>(in + N4 * 3);
- sum13 = a1 + a3;
-
- cwrite<width, aligned>(out, sum02 + sum13);
- w2 = sum02 - sum13;
- cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2),
- radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2,
- cread<width, true>(twiddle + width)));
- diff02 = a0 - a2;
- diff13 = a1 - a3;
- if (inverse)
- {
- diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T()));
- diff13 = swap<2>(diff13);
- }
- else
- {
- diff13 = swap<2>(diff13);
- diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T()));
- }
-
- w1 = diff02 + diff13;
-
- cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1),
- radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1,
- cread<width, true>(twiddle + 0)));
- w3 = diff02 - diff13;
- cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(),
- w3, cread<width, true>(twiddle + width * 2)));
-}
-
-template <size_t width, bool inverse, typename T>
-KFR_SINTRIN cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>,
- const cvec<T, width>& w, const cvec<T, width>& tw)
-{
- vec<T, width> re1, im1, twre, twim;
- split(w, re1, im1);
- split(tw, twre, twim);
-
- const vec<T, width> b1re = re1 * twre;
- const vec<T, width> b1im = im1 * twre;
- if (inverse)
- return concat(b1re + im1 * twim, b1im - re1 * twim);
- else
- return concat(b1re - im1 * twim, b1im + re1 * twim);
-}
-
-template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_SINTRIN void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>,
- cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
- const complex<T>* in, const complex<T>* twiddle)
-{
- const size_t N4 = N / 4;
- cvec<T, width> w1, w2, w3;
- constexpr bool read_split = !splitin && splitout;
- constexpr bool write_split = splitin && !splitout;
-
- vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3;
-
- split(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0);
- split(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
- split(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2);
- split(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
-
- const vec<T, width> sum02re = re0 + re2;
- const vec<T, width> sum02im = im0 + im2;
- const vec<T, width> sum13re = re1 + re3;
- const vec<T, width> sum13im = im1 + im3;
-
- cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im));
- w2 = concat(sum02re - sum13re, sum02im - sum13im);
- cwrite_split<width, aligned, write_split>(
- out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2,
- cread<width, true>(twiddle + width)));
-
- const vec<T, width> diff02re = re0 - re2;
- const vec<T, width> diff02im = im0 - im2;
- const vec<T, width> diff13re = re1 - re3;
- const vec<T, width> diff13im = im1 - im3;
-
- (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
- (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
-
- cwrite_split<width, aligned, write_split>(
- out + N4 * (use_br2 ? 2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1,
- cread<width, true>(twiddle + 0)));
- cwrite_split<width, aligned, write_split>(
- out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3,
- cread<width, true>(twiddle + width * 2)));
-}
-
-template <typename T>
-CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size)
-{
- if (n == 0)
- {
- return make_vector(static_cast<T>(1), static_cast<T>(0));
- }
- else if (n == size / 4)
- {
- return make_vector(static_cast<T>(0), static_cast<T>(-1));
- }
- else if (n == size / 2)
- {
- return make_vector(static_cast<T>(-1), static_cast<T>(0));
- }
- else if (n == size * 3 / 4)
- {
- return make_vector(static_cast<T>(0), static_cast<T>(1));
- }
- else
- {
- fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size));
- fbase tcos = +kfr::cos(kth);
- fbase tsin = -kfr::sin(kth);
- return make_vector(static_cast<T>(tcos), static_cast<T>(tsin));
- }
-}
-
-template <typename T, size_t width>
-KFR_SINTRIN void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size,
- bool split_format)
-{
- vec<T, 2 * width> result = T();
- CMT_LOOP_UNROLL
- for (size_t i = 0; i < width; i++)
- {
- const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size);
- result[i * 2] = r[0];
- result[i * 2 + 1] = r[1];
- }
- if (split_format)
- ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result);
- else
- ref_cast<cvec<T, width>>(twiddle[0]) = result;
- twiddle += width;
-}
-
-template <typename T, size_t width>
-CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format)
-{
- const size_t count = stage_size / 4;
- size_t nnstep = size / stage_size;
- DFT_ASSERT(width <= count);
- CMT_LOOP_NOUNROLL
- for (size_t n = 0; n < count; n += width)
- {
- initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format);
- initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, nnstep * 2, size, split_format);
- initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format);
- }
-}
-
-#if defined CMT_ARCH_SSE
-#ifdef CMT_COMPILER_GNU
-#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0);
-#else
-#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0);
-#endif
-#else
-#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr));
-#endif
-
-template <typename T>
-KFR_SINTRIN void prefetch_one(const complex<T>* in)
-{
- KFR_PREFETCH(in);
-}
-
-template <typename T>
-KFR_SINTRIN void prefetch_four(size_t stride, const complex<T>* in)
-{
- KFR_PREFETCH(in);
- KFR_PREFETCH(in + stride);
- KFR_PREFETCH(in + stride * 2);
- KFR_PREFETCH(in + stride * 3);
-}
-
-template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
- bool inverse, bool aligned, typename T>
-KFR_SINTRIN cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>, cbool_t<splitin>,
- cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
- complex<T>* out, const complex<T>* in, const complex<T>*& twiddle)
-{
- constexpr static size_t prefetch_offset = width * 8;
- const auto N4 = N / csize_t<4>();
- const auto N43 = N4 * csize_t<3>();
- CMT_ASSUME(blocks > 0);
- CMT_ASSUME(N > 0);
- CMT_ASSUME(N4 > 0);
- DFT_ASSERT(width <= N4);
- CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++)
- {
- CMT_PRAGMA_CLANG(clang loop unroll_count(2))
- for (size_t n2 = 0; n2 < N4; n2 += width)
- {
- if (prefetch)
- prefetch_four(N4, in + prefetch_offset);
- radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
- cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
- in, twiddle + n2 * 3);
- in += width;
- out += width;
- }
- in += N43;
- out += N43;
- }
- twiddle += N43;
- return {};
-}
-
-template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_SINTRIN ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>,
- cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
- complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+inline namespace CMT_ARCH_NAME
{
- CMT_ASSUME(blocks > 0);
- constexpr static size_t prefetch_offset = 32 * 4;
- for (size_t b = 0; b < blocks; b++)
- {
- if (prefetch)
- prefetch_four(csize_t<64>(), out + prefetch_offset);
- cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
- split(cread_split<8, aligned, splitin>(out + 0), w0, w1);
- split(cread_split<8, aligned, splitin>(out + 8), w2, w3);
- split(cread_split<8, aligned, splitin>(out + 16), w4, w5);
- split(cread_split<8, aligned, splitin>(out + 24), w6, w7);
-
- butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
-
- w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
- w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
- w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
- w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
- w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
- w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
- w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
-
- cvec<T, 8> z0, z1, z2, z3;
- transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
-
- butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
- cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3)));
- out += 32;
- }
- return {};
-}
-
-template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_SINTRIN ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
- cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
- complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
-{
- CMT_ASSUME(blocks > 0);
- DFT_ASSERT(2 <= blocks);
- constexpr static size_t prefetch_offset = width * 16;
- for (size_t b = 0; b < blocks; b += 2)
- {
- if (prefetch)
- prefetch_one(out + prefetch_offset);
-
- cvec<T, 8> vlo = cread<8, aligned>(out + 0);
- cvec<T, 8> vhi = cread<8, aligned>(out + 8);
- butterfly8<inverse>(vlo);
- butterfly8<inverse>(vhi);
- vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo);
- vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi);
- cwrite<8, aligned>(out, vlo);
- cwrite<8, aligned>(out + 8, vhi);
- out += 16;
- }
- return {};
-}
-
-template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_SINTRIN ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
- cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
- complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
-{
- CMT_ASSUME(blocks > 0);
- constexpr static size_t prefetch_offset = width * 4;
- DFT_ASSERT(2 <= blocks);
- CMT_PRAGMA_CLANG(clang loop unroll_count(2))
- for (size_t b = 0; b < blocks; b += 2)
- {
- if (prefetch)
- prefetch_one(out + prefetch_offset);
-
- cvec<T, 16> vlo = cread<16, aligned>(out);
- cvec<T, 16> vhi = cread<16, aligned>(out + 16);
- butterfly4<4, inverse>(vlo);
- butterfly4<4, inverse>(vhi);
- apply_twiddles4<0, 4, 4, inverse>(vlo);
- apply_twiddles4<0, 4, 4, inverse>(vhi);
- vlo = digitreverse4<2>(vlo);
- vhi = digitreverse4<2>(vhi);
- butterfly4<4, inverse>(vlo);
- butterfly4<4, inverse>(vhi);
-
- use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
- use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
- out += 32;
- }
- return {};
-}
+constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{};
-template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
-KFR_SINTRIN ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
- cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
- complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+namespace intrinsics
{
- constexpr static size_t prefetch_offset = width * 4;
- CMT_ASSUME(blocks > 0);
- DFT_ASSERT(4 <= blocks);
- CMT_LOOP_NOUNROLL
- for (size_t b = 0; b < blocks; b += 4)
- {
- if (prefetch)
- prefetch_one(out + prefetch_offset);
-
- cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out);
- butterfly4<4, inverse>(v16);
- cdigitreverse4_write<aligned>(out, v16);
-
- out += 4 * 4;
- }
- return {};
-}
template <typename T>
-static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
+void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
{
complex<T>* twiddle = ptr_cast<complex<T>>(stage->data);
const size_t N = stage->repeats * stage->radix;
@@ -507,7 +80,7 @@ static void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width)
template <typename T, size_t fixed_radix>
struct dft_stage_fixed_impl : dft_stage<T>
{
- dft_stage_fixed_impl(size_t radix_, size_t iterations, size_t blocks)
+ dft_stage_fixed_impl(size_t, size_t iterations, size_t blocks)
{
this->name = type_name<decltype(*this)>();
this->radix = fixed_radix;
@@ -523,11 +96,11 @@ struct dft_stage_fixed_impl : dft_stage<T>
constexpr static size_t width = fixed_radix >= 7
? fft_vector_width<T> / 2
: fixed_radix >= 4 ? fft_vector_width<T> : fft_vector_width<T> * 2;
- virtual void do_initialize(size_t size) override final { dft_stage_fixed_initialize(this, width); }
+ virtual void do_initialize(size_t) override final { dft_stage_fixed_initialize(this, width); }
DFT_STAGE_FN
template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
const size_t Nord = this->repeats;
const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
@@ -546,7 +119,7 @@ struct dft_stage_fixed_impl : dft_stage<T>
template <typename T, size_t fixed_radix>
struct dft_stage_fixed_final_impl : dft_stage<T>
{
- dft_stage_fixed_final_impl(size_t radix_, size_t iterations, size_t blocks)
+ dft_stage_fixed_final_impl(size_t, size_t iterations, size_t blocks)
{
this->name = type_name<decltype(*this)>();
this->radix = fixed_radix;
@@ -561,10 +134,9 @@ struct dft_stage_fixed_final_impl : dft_stage<T>
DFT_STAGE_FN
template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
- const size_t b = this->blocks;
- const size_t size = b * fixed_radix;
+ const size_t b = this->blocks;
butterflies(b, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, b);
}
@@ -584,27 +156,32 @@ inline auto apply_conj(E& e, ctrue_t)
/// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1]
template <typename E>
-struct fft_inverse : expression_base<E>
+struct fft_inverse : internal::expression_with_arguments<E>
{
using value_type = value_type_of<E>;
- CMT_INLINE fft_inverse(E&& expr) noexcept : expression_base<E>(std::forward<E>(expr)) {}
+ KFR_MEM_INTRINSIC fft_inverse(E&& expr) CMT_NOEXCEPT
+ : internal::expression_with_arguments<E>(std::forward<E>(expr))
+ {
+ }
- CMT_INLINE vec<value_type, 1> operator()(cinput_t input, size_t index, vec_t<value_type, 1>) const
+ friend KFR_INTRINSIC vec<value_type, 1> get_elements(const fft_inverse& self, cinput_t input,
+ size_t index, vec_shape<value_type, 1>)
{
- return this->argument_first(input, index == 0 ? 0 : this->size() - index, vec_t<value_type, 1>());
+ return self.argument_first(input, index == 0 ? 0 : self.size() - index, vec_shape<value_type, 1>());
}
template <size_t N>
- CMT_INLINE vec<value_type, N> operator()(cinput_t input, size_t index, vec_t<value_type, N>) const
+ friend KFR_MEM_INTRINSIC vec<value_type, N> get_elements(const fft_inverse& self, cinput_t input,
+ size_t index, vec_shape<value_type, N>)
{
if (index == 0)
{
return concat(
- this->argument_first(input, index, vec_t<value_type, 1>()),
- reverse(this->argument_first(input, this->size() - (N - 1), vec_t<value_type, N - 1>())));
+ self.argument_first(input, index, vec_shape<value_type, 1>()),
+ reverse(self.argument_first(input, self.size() - (N - 1), vec_shape<value_type, N - 1>())));
}
- return reverse(this->argument_first(input, this->size() - index - (N - 1), vec_t<value_type, N>()));
+ return reverse(self.argument_first(input, self.size() - index - (N - 1), vec_shape<value_type, N>()));
}
};
@@ -618,7 +195,7 @@ template <typename T>
struct dft_arblen_stage_impl : dft_stage<T>
{
dft_arblen_stage_impl(size_t size)
- : fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal), size(size)
+ : size(size), fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal)
{
this->name = type_name<decltype(*this)>();
this->radix = size;
@@ -642,10 +219,9 @@ struct dft_arblen_stage_impl : dft_stage<T>
DFT_STAGE_FN
template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
{
- const size_t n = this->size;
- const size_t N2 = this->fftsize;
+ const size_t n = this->size;
auto&& chirp = apply_conj(chirp_, cbool<inverse>);
@@ -703,7 +279,7 @@ struct dft_special_stage_impl : dft_stage<T>
}
DFT_STAGE_FN
template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
{
complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size);
stage1.do_execute(cbool<inverse>, scratch, in, temp);
@@ -730,7 +306,7 @@ struct dft_stage_generic_impl : dft_stage<T>
}
protected:
- virtual void do_initialize(size_t size) override final
+ virtual void do_initialize(size_t) override final
{
complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
CMT_LOOP_NOUNROLL
@@ -746,12 +322,10 @@ protected:
DFT_STAGE_FN
template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
{
const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
const size_t bl = this->blocks;
- const size_t Nord = this->repeats;
- const size_t N = Nord * this->radix;
CMT_LOOP_NOUNROLL
for (size_t b = 0; b < bl; b++)
@@ -848,7 +422,7 @@ protected:
DFT_STAGE_FN
template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
cswitch(dft_radices, radices[0],
[&](auto first_radix) {
@@ -883,441 +457,7 @@ protected:
});
}
};
-
-template <typename T, bool splitin, bool is_even>
-struct fft_stage_impl : dft_stage<T>
-{
- fft_stage_impl(size_t stage_size)
- {
- this->name = type_name<decltype(*this)>();
- this->radix = 4;
- this->stage_size = stage_size;
- this->repeats = 4;
- this->recursion = true;
- this->data_size =
- align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
- }
-
-protected:
- constexpr static bool prefetch = true;
- constexpr static bool aligned = false;
- constexpr static size_t width = fft_vector_width<T>;
-
- virtual void do_initialize(size_t size) override final
- {
- complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- initialize_twiddles<T, width>(twiddle, this->stage_size, size, true);
- }
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- if (splitin)
- in = out;
- const size_t stg_size = this->stage_size;
- CMT_ASSUME(stg_size >= 2048);
- CMT_ASSUME(stg_size % 2048 == 0);
- radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
- }
-};
-
-template <typename T, bool splitin, size_t size>
-struct fft_final_stage_impl : dft_stage<T>
-{
- fft_final_stage_impl(size_t)
- {
- this->name = type_name<decltype(*this)>();
- this->radix = size;
- this->stage_size = size;
- this->out_offset = size;
- this->repeats = 4;
- this->recursion = true;
- this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment);
- }
-
-protected:
- constexpr static size_t width = fft_vector_width<T>;
- constexpr static bool is_even = cometa::is_even(ilog2(size));
- constexpr static bool use_br2 = !is_even;
- constexpr static bool aligned = false;
- constexpr static bool prefetch = splitin;
-
- KFR_INTRIN void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
- KFR_INTRIN void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
-
- template <size_t N, bool pass_splitin>
- KFR_INTRIN void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>, complex<T>*& twiddle)
- {
- constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width;
- constexpr size_t pass_width = const_min(width, N / 4);
- initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_split || pass_splitin);
- init_twiddles(csize<N / 4>, total_size, cbool<pass_split>, twiddle);
- }
-
- virtual void do_initialize(size_t total_size) override final
- {
- complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle);
- }
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
- }
-
- template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)>
- KFR_INTRIN void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
- {
- radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-
- template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)>
- KFR_INTRIN void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
- {
- radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-
- template <bool inverse>
- KFR_INTRIN void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
- {
- radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-
- template <bool inverse>
- KFR_INTRIN void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
- const complex<T>*& twiddle)
- {
- radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-
- template <bool inverse, size_t N, bool pass_splitin>
- KFR_INTRIN void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
- const complex<T>* in, const complex<T>*& twiddle)
- {
- static_assert(N > 8, "");
- constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width;
- constexpr size_t pass_width = const_min(width, N / 4);
- static_assert(pass_width == width || (pass_split == pass_splitin), "");
- static_assert(pass_width <= N / 4, "");
- radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_split>, cbool_t<pass_splitin>(),
- cbool_t<use_br2>(), cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in,
- twiddle);
- final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_split>, out, out, twiddle);
- }
-};
-
-template <typename T, bool is_even>
-struct fft_reorder_stage_impl : dft_stage<T>
-{
- fft_reorder_stage_impl(size_t stage_size)
- {
- this->name = type_name<decltype(*this)>();
- this->stage_size = stage_size;
- log2n = ilog2(stage_size);
- this->data_size = 0;
- }
-
-protected:
- size_t log2n;
-
- virtual void do_initialize(size_t) override final {}
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- fft_reorder(out, log2n, cbool_t<!is_even>());
- }
-};
-
-template <typename T, size_t log2n>
-struct fft_specialization;
-
-template <typename T>
-struct fft_specialization<T, 1> : dft_stage<T>
-{
- fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-
-protected:
- constexpr static bool aligned = false;
- DFT_STAGE_FN
-
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- cvec<T, 1> a0, a1;
- split(cread<2, aligned>(in), a0, a1);
- cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1));
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 2> : dft_stage<T>
-{
- fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-
-protected:
- constexpr static bool aligned = false;
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- cvec<T, 1> a0, a1, a2, a3;
- split(cread<4>(in), a0, a1, a2, a3);
- butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3);
- cwrite<4>(out, concat(a0, a1, a2, a3));
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 3> : dft_stage<T>
-{
- fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-
-protected:
- constexpr static bool aligned = false;
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- cvec<T, 8> v8 = cread<8, aligned>(in);
- butterfly8<inverse>(v8);
- cwrite<8, aligned>(out, v8);
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 4> : dft_stage<T>
-{
- fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-
-protected:
- constexpr static bool aligned = false;
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- cvec<T, 16> v16 = cread<16, aligned>(in);
- butterfly16<inverse>(v16);
- cwrite<16, aligned>(out, v16);
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 5> : dft_stage<T>
-{
- fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-
-protected:
- constexpr static bool aligned = false;
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- cvec<T, 32> v32 = cread<32, aligned>(in);
- butterfly32<inverse>(v32);
- cwrite<32, aligned>(out, v32);
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 6> : dft_stage<T>
-{
- fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-
-protected:
- constexpr static bool aligned = false;
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 7> : dft_stage<T>
-{
- fft_specialization(size_t)
- {
- this->name = type_name<decltype(*this)>();
- this->stage_size = 128;
- this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
- }
-
-protected:
- constexpr static bool aligned = false;
- constexpr static size_t width = platform<T>::vector_width;
- constexpr static bool use_br2 = true;
- constexpr static bool prefetch = false;
- constexpr static bool is_double = sizeof(T) == 8;
- constexpr static size_t final_size = is_double ? 8 : 32;
- constexpr static size_t split_format = final_size == 8;
-
- virtual void do_initialize(size_t total_size) override final
- {
- complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- initialize_twiddles<T, width>(twiddle, 128, total_size, split_format);
- initialize_twiddles<T, width>(twiddle, 32, total_size, split_format);
- initialize_twiddles<T, width>(twiddle, 8, total_size, split_format);
- }
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
- final_pass<inverse>(csize_t<final_size>(), out, in, twiddle);
- if (this->need_reorder)
- fft_reorder(out, csize_t<7>());
- }
-
- template <bool inverse>
- KFR_INTRIN void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
- {
- radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
- cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
- radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
- cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-
- template <bool inverse>
- KFR_INTRIN void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in, const complex<T>* twiddle)
- {
- radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
- cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
- radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
- cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
- }
-};
-
-template <>
-struct fft_specialization<float, 8> : dft_stage<float>
-{
- fft_specialization(size_t)
- {
- this->name = type_name<decltype(*this)>();
- this->temp_size = sizeof(complex<float>) * 256;
- }
-
-protected:
- using T = float;
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
- {
- complex<float>* scratch = ptr_cast<complex<float>>(temp);
- if (out == in)
- {
- butterfly16_multi_flip<0, inverse>(scratch, out);
- butterfly16_multi_flip<1, inverse>(scratch, out);
- butterfly16_multi_flip<2, inverse>(scratch, out);
- butterfly16_multi_flip<3, inverse>(scratch, out);
-
- butterfly16_multi_natural<0, inverse>(out, scratch);
- butterfly16_multi_natural<1, inverse>(out, scratch);
- butterfly16_multi_natural<2, inverse>(out, scratch);
- butterfly16_multi_natural<3, inverse>(out, scratch);
- }
- else
- {
- butterfly16_multi_flip<0, inverse>(out, in);
- butterfly16_multi_flip<1, inverse>(out, in);
- butterfly16_multi_flip<2, inverse>(out, in);
- butterfly16_multi_flip<3, inverse>(out, in);
-
- butterfly16_multi_natural<0, inverse>(out, out);
- butterfly16_multi_natural<1, inverse>(out, out);
- butterfly16_multi_natural<2, inverse>(out, out);
- butterfly16_multi_natural<3, inverse>(out, out);
- }
- }
-};
-
-template <>
-struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
-{
- using T = double;
- fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
- {
- this->name = type_name<decltype(*this)>();
- }
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
- if (this->need_reorder)
- fft_reorder(out, csize_t<8>());
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
-{
- fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
- {
- this->name = type_name<decltype(*this)>();
- }
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr);
- if (this->need_reorder)
- fft_reorder(out, csize_t<9>());
- }
-};
-
-template <typename T>
-struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
-{
- fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size)
- {
- this->name = type_name<decltype(*this)>();
- }
-
- DFT_STAGE_FN
- template <bool inverse>
- KFR_INTRIN void do_execute(complex<T>* out, const complex<T>* in, u8*)
- {
- fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr);
- if (this->need_reorder)
- fft_reorder(out, 10, cfalse);
- }
-};
-
-} // namespace internal
-
-//
-
-template <typename T>
-template <typename Stage, typename... Args>
-void dft_plan<T>::add_stage(Args... args)
-{
- dft_stage<T>* stage = new Stage(args...);
- stage->need_reorder = need_reorder;
- this->data_size += stage->data_size;
- this->temp_size += stage->temp_size;
- stages.push_back(dft_stage_ptr(stage));
-}
+} // namespace intrinsics
template <typename T>
template <bool is_final>
@@ -1325,366 +465,83 @@ void dft_plan<T>::prepare_dft_stage(size_t radix, size_t iterations, size_t bloc
{
return cswitch(
dft_radices, radix,
- [&](auto radix) CMT_INLINE_LAMBDA {
- add_stage<conditional<is_final, internal::dft_stage_fixed_final_impl<T, val_of(radix)>,
- internal::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations,
- blocks);
+ [this, iterations, blocks](auto radix) CMT_INLINE_LAMBDA {
+ add_stage<conditional<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>,
+ intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>(radix, iterations,
+ blocks);
},
- [&]() { add_stage<internal::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks); });
+ [this, radix, iterations, blocks]() {
+ add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(radix, iterations, blocks);
+ });
}
template <typename T>
-template <bool is_even, bool first>
-void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>)
+void dft_plan<T>::init_dft(size_t size, dft_order)
{
- constexpr size_t final_size = is_even ? 1024 : 512;
-
- if (stage_size >= 2048)
+ if (size == 60)
{
- add_stage<internal::fft_stage_impl<T, !first, is_even>>(stage_size);
-
- make_fft(stage_size / 4, cbool_t<is_even>(), cfalse);
+ this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>();
}
- else
+ else if (size == 48)
{
- add_stage<internal::fft_final_stage_impl<T, !first, final_size>>(final_size);
+ this->add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>();
}
-}
-
-template <typename T>
-struct reverse_wrapper
-{
- T& iterable;
-};
-
-template <typename T>
-auto begin(reverse_wrapper<T> w)
-{
- return std::rbegin(w.iterable);
-}
-
-template <typename T>
-auto end(reverse_wrapper<T> w)
-{
- return std::rend(w.iterable);
-}
-
-template <typename T>
-reverse_wrapper<T> reversed(T&& iterable)
-{
- return { iterable };
-}
-
-template <typename T>
-void dft_plan<T>::initialize()
-{
- data = autofree<u8>(data_size);
- size_t offset = 0;
- for (dft_stage_ptr& stage : stages)
- {
- stage->data = data.data() + offset;
- stage->initialize(this->size);
- offset += stage->data_size;
- }
-
- bool to_scratch = false;
- bool scratch_needed = false;
- for (dft_stage_ptr& stage : reversed(stages))
- {
- if (to_scratch)
- {
- scratch_needed = true;
- }
- stage->to_scratch = to_scratch;
- if (!stage->can_inplace)
- {
- to_scratch = !to_scratch;
- }
- }
- if (scratch_needed || !stages[0]->can_inplace)
- this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
-}
-
-template <typename T>
-const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in,
- const complex<T>* scratch, bool in_scratch) const
-{
- if (stage == 0)
- return in_scratch ? scratch : in;
- return stages[stage - 1]->to_scratch ? scratch : out;
-}
-
-template <typename T>
-complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
-{
- return stages[stage]->to_scratch ? scratch : out;
-}
-
-template <typename T>
-template <bool inverse>
-void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
-{
- if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
- {
- return stages[0]->execute(cbool<inverse>, out, in, temp);
- }
- size_t stack[32] = { 0 };
-
- complex<T>* scratch =
- ptr_cast<complex<T>>(temp + this->temp_size -
- align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
-
- bool in_scratch = !stages[0]->can_inplace && in == out;
- if (in_scratch)
+ else
{
- internal::builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
- }
-
- const size_t count = stages.size();
+ size_t cur_size = size;
+ constexpr size_t radices_count = dft_radices.back() + 1;
+ u8 count[radices_count] = { 0 };
+ int radices[32] = { 0 };
+ size_t radices_size = 0;
- for (size_t depth = 0; depth < count;)
- {
- if (stages[depth]->recursion)
- {
- size_t offset = 0;
- size_t rdepth = depth;
- size_t maxdepth = depth;
- do
+ cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) {
+ while (cur_size && cur_size % val_of(radix) == 0)
{
- if (stack[rdepth] == stages[rdepth]->repeats)
- {
- stack[rdepth] = 0;
- rdepth--;
- }
- else
- {
- complex<T>* rout = select_out(rdepth, out, scratch);
- const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
- stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
- offset += stages[rdepth]->out_offset;
- stack[rdepth]++;
- if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
- rdepth++;
- else
- maxdepth = rdepth;
- }
- } while (rdepth != depth);
- depth = maxdepth + 1;
- }
- else
- {
- stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch),
- select_in(depth, out, in, scratch, in_scratch), temp);
- depth++;
- }
- }
-}
+ count[val_of(radix)]++;
+ cur_size /= val_of(radix);
+ }
+ });
-template <typename T>
-dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0)
-{
- need_reorder = true;
- if (is_poweroftwo(size))
- {
- const size_t log2n = ilog2(size);
- cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n,
- [&](auto log2n) {
- (void)log2n;
- constexpr size_t log2nv = val_of(decltype(log2n)());
- this->add_stage<internal::fft_specialization<T, log2nv>>(size);
- },
- [&]() {
- cswitch(cfalse_true, is_even(log2n), [&](auto is_even) {
- this->make_fft(size, is_even, ctrue);
- constexpr size_t is_evenv = val_of(decltype(is_even)());
- if (need_reorder)
- this->add_stage<internal::fft_reorder_stage_impl<T, is_evenv>>(size);
- });
- });
- }
-#ifndef KFR_DFT_NO_NPo2
- else
- {
- if (size == 60)
- {
- this->add_stage<internal::dft_special_stage_impl<T, 6, 10>>();
- }
- else if (size == 48)
+ if (cur_size >= 101)
{
- this->add_stage<internal::dft_special_stage_impl<T, 6, 8>>();
+ this->add_stage<intrinsics::dft_arblen_stage_impl<T>>(size);
}
else
{
- size_t cur_size = size;
- constexpr size_t radices_count = dft_radices.back() + 1;
- u8 count[radices_count] = { 0 };
- int radices[32] = { 0 };
- size_t radices_size = 0;
-
- cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], [&](auto radix) {
- while (cur_size && cur_size % val_of(radix) == 0)
- {
- count[val_of(radix)]++;
- cur_size /= val_of(radix);
- }
- });
+ size_t blocks = 1;
+ size_t iterations = size;
- if (cur_size >= 101)
+ for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++)
{
- this->add_stage<internal::dft_arblen_stage_impl<T>>(size);
- }
- else
- {
- size_t blocks = 1;
- size_t iterations = size;
-
- for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++)
- {
- for (size_t i = 0; i < count[r]; i++)
- {
- iterations /= r;
- radices[radices_size++] = r;
- if (iterations == 1)
- this->prepare_dft_stage(r, iterations, blocks, ctrue);
- else
- this->prepare_dft_stage(r, iterations, blocks, cfalse);
- blocks *= r;
- }
- }
-
- if (cur_size > 1)
+ for (size_t i = 0; i < count[r]; i++)
{
- iterations /= cur_size;
- radices[radices_size++] = cur_size;
+ iterations /= r;
+ radices[radices_size++] = r;
if (iterations == 1)
- this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
+ this->prepare_dft_stage(r, iterations, blocks, ctrue);
else
- this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+ this->prepare_dft_stage(r, iterations, blocks, cfalse);
+ blocks *= r;
}
-
- if (stages.size() > 2)
- this->add_stage<internal::dft_reorder_stage_impl<T>>(radices, radices_size);
}
- }
- }
-#endif
- initialize();
-}
-
-template <typename T>
-dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4)
-{
- using namespace internal;
- constexpr size_t width = platform<T>::vector_width * 2;
-
- block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) {
- constexpr size_t width = val_of(decltype(w)());
- cwrite<width>(rtwiddle.data() + i,
- cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2)))));
- });
-}
-
-template <typename T>
-void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const
-{
- using namespace internal;
- size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2
-
- constexpr size_t width = platform<T>::vector_width * 2;
- const cvec<T, 1> dc = cread<1>(out);
- const size_t count = csize / 2;
-
- block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) {
- i++;
- constexpr size_t width = val_of(decltype(w)());
- constexpr size_t widthm1 = width - 1;
- const cvec<T, width> tw = cread<width>(rtwiddle.data() + i);
- const cvec<T, width> fpk = cread<width>(out + i);
- const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1)));
-
- const cvec<T, width> f1k = fpk + fpnk;
- const cvec<T, width> f2k = fpk - fpnk;
- const cvec<T, width> t = cmul(f2k, tw);
- cwrite<width>(out + i, T(0.5) * (f1k + t));
- cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t))));
- });
-
- {
- size_t k = csize / 2;
- const cvec<T, 1> fpk = cread<1>(out + k);
- const cvec<T, 1> fpnk = negodd(fpk);
- cwrite<1>(out + k, fpnk);
- }
- if (fmt == dft_pack_format::CCs)
- {
- cwrite<1>(out, pack(dc[0] + dc[1], 0));
- cwrite<1>(out + csize, pack(dc[0] - dc[1], 0));
- }
- else
- {
- cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1]));
- }
-}
-
-template <typename T>
-void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const
-{
- using namespace internal;
-
- const size_t csize = this->size / 2;
-
- cvec<T, 1> dc;
-
- if (fmt == dft_pack_format::CCs)
- {
- dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real());
- }
- else
- {
- dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag());
- }
-
- constexpr size_t width = platform<T>::vector_width * 2;
- const size_t count = csize / 2;
-
- block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) {
- i++;
- constexpr size_t width = val_of(decltype(w)());
- constexpr size_t widthm1 = width - 1;
- const cvec<T, width> tw = cread<width>(rtwiddle.data() + i);
- const cvec<T, width> fpk = cread<width>(in + i);
- const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
-
- const cvec<T, width> f1k = fpk + fpnk;
- const cvec<T, width> f2k = fpk - fpnk;
- const cvec<T, width> t = cmul_conj(f2k, tw);
- cwrite<width>(out + i, f1k + t);
- cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t)));
- });
+ if (cur_size > 1)
+ {
+ iterations /= cur_size;
+ radices[radices_size++] = cur_size;
+ if (iterations == 1)
+ this->prepare_dft_stage(cur_size, iterations, blocks, ctrue);
+ else
+ this->prepare_dft_stage(cur_size, iterations, blocks, cfalse);
+ }
- {
- size_t k = csize / 2;
- const cvec<T, 1> fpk = cread<1>(in + k);
- const cvec<T, 1> fpnk = 2 * negodd(fpk);
- cwrite<1>(out + k, fpnk);
+ if (stages.size() > 2)
+ this->add_stage<intrinsics::dft_reorder_stage_impl<T>>(radices, radices_size);
+ }
}
- cwrite<1>(out, dc);
}
-template <typename T>
-dft_plan<T>::~dft_plan()
-{
-}
-
-template <typename T>
-void dft_plan<T>::dump() const
-{
- for (const dft_stage_ptr& s : stages)
- {
- s->dump();
- }
-}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dft/impl/dft-src.cpp b/include/kfr/dft/impl/dft-src.cpp
@@ -24,7 +24,8 @@
See https://www.kfrlib.com for details.
*/
-#include "dft-impl.hpp"
+#include "../dft_c.h"
+#include "../fft.hpp"
namespace kfr
{
@@ -41,27 +42,26 @@ extern "C"
return reinterpret_cast<KFR_DFT_PLAN_F64*>(new kfr::dft_plan<double>(size));
}
- void kfr_dft_execute_f32(KFR_DFT_PLAN_F32* plan, size_t size, float* out, const float* in, uint8_t* temp)
+ void kfr_dft_execute_f32(KFR_DFT_PLAN_F32* plan, size_t, float* out, const float* in, uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan<float>*>(plan)->execute(
reinterpret_cast<kfr::complex<float>*>(out), reinterpret_cast<const kfr::complex<float>*>(in),
temp, kfr::cfalse);
}
- void kfr_dft_execute_f64(KFR_DFT_PLAN_F64* plan, size_t size, double* out, const double* in,
- uint8_t* temp)
+ void kfr_dft_execute_f64(KFR_DFT_PLAN_F64* plan, size_t, double* out, const double* in, uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan<double>*>(plan)->execute(
reinterpret_cast<kfr::complex<double>*>(out), reinterpret_cast<const kfr::complex<double>*>(in),
temp, kfr::cfalse);
}
- void kfr_dft_execute_inverse_f32(KFR_DFT_PLAN_F32* plan, size_t size, float* out, const float* in,
+ void kfr_dft_execute_inverse_f32(KFR_DFT_PLAN_F32* plan, size_t, float* out, const float* in,
uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan<float>*>(plan)->execute(
reinterpret_cast<kfr::complex<float>*>(out), reinterpret_cast<const kfr::complex<float>*>(in),
temp, kfr::ctrue);
}
- void kfr_dft_execute_inverse_f64(KFR_DFT_PLAN_F64* plan, size_t size, double* out, const double* in,
+ void kfr_dft_execute_inverse_f64(KFR_DFT_PLAN_F64* plan, size_t, double* out, const double* in,
uint8_t* temp)
{
reinterpret_cast<kfr::dft_plan<double>*>(plan)->execute(
@@ -89,29 +89,29 @@ extern "C"
return reinterpret_cast<KFR_DFT_REAL_PLAN_F64*>(new kfr::dft_plan_real<double>(size));
}
- void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out, const float* in,
+ void kfr_dft_execute_real_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in,
uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
{
reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute(
reinterpret_cast<kfr::complex<float>*>(out), in, temp,
static_cast<kfr::dft_pack_format>(pack_format));
}
- void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out, const double* in,
+ void kfr_dft_execute_real_f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in,
uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
{
reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute(
reinterpret_cast<kfr::complex<double>*>(out), in, temp,
static_cast<kfr::dft_pack_format>(pack_format));
}
- void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t size, float* out,
- const float* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
+ void kfr_dft_execute_real_inverse_f32(KFR_DFT_REAL_PLAN_F32* plan, size_t, float* out, const float* in,
+ uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
{
reinterpret_cast<kfr::dft_plan_real<float>*>(plan)->execute(
out, reinterpret_cast<const kfr::complex<float>*>(in), temp,
static_cast<kfr::dft_pack_format>(pack_format));
}
- void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t size, double* out,
- const double* in, uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
+ void kfr_dft_execute_real_inverse__f64(KFR_DFT_REAL_PLAN_F64* plan, size_t, double* out, const double* in,
+ uint8_t* temp, KFR_DFT_PACK_FORMAT pack_format)
{
reinterpret_cast<kfr::dft_plan_real<double>*>(plan)->execute(
out, reinterpret_cast<const kfr::complex<double>*>(in), temp,
diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp
@@ -29,19 +29,13 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
-template dft_plan<FLOAT>::dft_plan(size_t, dft_order);
-template dft_plan<FLOAT>::~dft_plan();
-template void dft_plan<FLOAT>::dump() const;
-template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out,
- const kfr::complex<FLOAT>* in, kfr::u8* temp) const;
-template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out,
- const kfr::complex<FLOAT>* in, kfr::u8* temp) const;
-template dft_plan_real<FLOAT>::dft_plan_real(size_t);
-template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in,
- kfr::dft_pack_format fmt) const;
-template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const;
-
+#ifndef KFR_DFT_NO_NPo2
+template void dft_plan<FLOAT>::init_dft(size_t, dft_order);
+#endif
+} // namespace CMT_ARCH_NAME
} // namespace kfr
#endif
diff --git a/include/kfr/dft/impl/fft-impl-f32.cpp b/include/kfr/dft/impl/fft-impl-f32.cpp
@@ -0,0 +1,29 @@
+/** @addtogroup dft
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#include "fft-impl.hpp"
+
+#define FLOAT float
+#include "fft-templates.hpp"
diff --git a/include/kfr/dft/impl/fft-impl-f64.cpp b/include/kfr/dft/impl/fft-impl-f64.cpp
@@ -0,0 +1,29 @@
+/** @addtogroup dft
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#include "fft-impl.hpp"
+
+#define FLOAT double
+#include "fft-templates.hpp"
diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -0,0 +1,1148 @@
+/** @addtogroup dft
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "dft-fft.hpp"
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+#if CMT_HAS_WARNING("-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+#endif
+#if CMT_HAS_WARNING("-Wunused-lambda-capture")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture")
+#endif
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4100))
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <size_t width, bool inverse, typename T>
+KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>,
+ const cvec<T, width>& w, const cvec<T, width>& tw)
+{
+ cvec<T, width> ww = w;
+ cvec<T, width> tw_ = tw;
+ cvec<T, width> b1 = ww * dupeven(tw_);
+ ww = swap<2>(ww);
+
+ if (inverse)
+ tw_ = -(tw_);
+ ww = subadd(b1, ww * dupodd(tw_));
+ return ww;
+}
+
+template <size_t width, bool use_br2, bool inverse, bool aligned, typename T>
+KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>,
+ cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in,
+ const complex<T>* twiddle)
+{
+ const size_t N4 = N / 4;
+ cvec<T, width> w1, w2, w3;
+
+ cvec<T, width> sum02, sum13, diff02, diff13;
+
+ cvec<T, width> a0, a1, a2, a3;
+ a0 = cread<width, aligned>(in + 0);
+ a2 = cread<width, aligned>(in + N4 * 2);
+ sum02 = a0 + a2;
+
+ a1 = cread<width, aligned>(in + N4);
+ a3 = cread<width, aligned>(in + N4 * 3);
+ sum13 = a1 + a3;
+
+ cwrite<width, aligned>(out, sum02 + sum13);
+ w2 = sum02 - sum13;
+ cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2),
+ radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2,
+ cread<width, true>(twiddle + width)));
+ diff02 = a0 - a2;
+ diff13 = a1 - a3;
+ if (inverse)
+ {
+ diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T()));
+ diff13 = swap<2>(diff13);
+ }
+ else
+ {
+ diff13 = swap<2>(diff13);
+ diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T()));
+ }
+
+ w1 = diff02 + diff13;
+
+ cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1),
+ radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1,
+ cread<width, true>(twiddle + 0)));
+ w3 = diff02 - diff13;
+ cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(),
+ w3, cread<width, true>(twiddle + width * 2)));
+}
+
+template <size_t width, bool inverse, typename T>
+KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>,
+ const cvec<T, width>& w, const cvec<T, width>& tw)
+{
+ vec<T, width> re1, im1, twre, twim;
+ split(w, re1, im1);
+ split(tw, twre, twim);
+
+ const vec<T, width> b1re = re1 * twre;
+ const vec<T, width> b1im = im1 * twre;
+ if (inverse)
+ return concat(b1re + im1 * twim, b1im - re1 * twim);
+ else
+ return concat(b1re - im1 * twim, b1im + re1 * twim);
+}
+
+template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T>
+KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>,
+ cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
+ const complex<T>* in, const complex<T>* twiddle)
+{
+ const size_t N4 = N / 4;
+ cvec<T, width> w1, w2, w3;
+ constexpr bool read_split = !splitin && splitout;
+ constexpr bool write_split = splitin && !splitout;
+
+ vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3;
+
+ split(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0);
+ split(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
+ split(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2);
+ split(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
+
+ const vec<T, width> sum02re = re0 + re2;
+ const vec<T, width> sum02im = im0 + im2;
+ const vec<T, width> sum13re = re1 + re3;
+ const vec<T, width> sum13im = im1 + im3;
+
+ cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im));
+ w2 = concat(sum02re - sum13re, sum02im - sum13im);
+ cwrite_split<width, aligned, write_split>(
+ out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2,
+ cread<width, true>(twiddle + width)));
+
+ const vec<T, width> diff02re = re0 - re2;
+ const vec<T, width> diff02im = im0 - im2;
+ const vec<T, width> diff13re = re1 - re3;
+ const vec<T, width> diff13im = im1 - im3;
+
+ (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
+ (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
+
+ cwrite_split<width, aligned, write_split>(
+ out + N4 * (use_br2 ? 2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1,
+ cread<width, true>(twiddle + 0)));
+ cwrite_split<width, aligned, write_split>(
+ out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3,
+ cread<width, true>(twiddle + width * 2)));
+}
+
+template <typename T>
+CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size)
+{
+ if (n == 0)
+ {
+ return make_vector(static_cast<T>(1), static_cast<T>(0));
+ }
+ else if (n == size / 4)
+ {
+ return make_vector(static_cast<T>(0), static_cast<T>(-1));
+ }
+ else if (n == size / 2)
+ {
+ return make_vector(static_cast<T>(-1), static_cast<T>(0));
+ }
+ else if (n == size * 3 / 4)
+ {
+ return make_vector(static_cast<T>(0), static_cast<T>(1));
+ }
+ else
+ {
+ fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size));
+ fbase tcos = +kfr::cos(kth);
+ fbase tsin = -kfr::sin(kth);
+ return make_vector(static_cast<T>(tcos), static_cast<T>(tsin));
+ }
+}
+
+template <typename T, size_t width>
+KFR_INTRINSIC void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size,
+ bool split_format)
+{
+ vec<T, 2 * width> result = T();
+ CMT_LOOP_UNROLL
+ for (size_t i = 0; i < width; i++)
+ {
+ const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size);
+ result[i * 2] = r[0];
+ result[i * 2 + 1] = r[1];
+ }
+ if (split_format)
+ ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result);
+ else
+ ref_cast<cvec<T, width>>(twiddle[0]) = result;
+ twiddle += width;
+}
+
+template <typename T, size_t width>
+CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format)
+{
+ const size_t count = stage_size / 4;
+ size_t nnstep = size / stage_size;
+ DFT_ASSERT(width <= count);
+ CMT_LOOP_NOUNROLL
+ for (size_t n = 0; n < count; n += width)
+ {
+ initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format);
+ initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, nnstep * 2, size, split_format);
+ initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format);
+ }
+}
+
+#ifdef KFR_NO_PREFETCH
+#define KFR_PREFETCH(addr) \
+ do \
+ { \
+ (void)(addr); \
+ } while (0)
+#else
+
+#if defined CMT_ARCH_SSE
+#ifdef CMT_COMPILER_GNU
+#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0);
+#else
+#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0);
+#endif
+#else
+#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr));
+#endif
+#endif
+
+template <typename T>
+KFR_INTRINSIC void prefetch_one(const complex<T>* in)
+{
+ KFR_PREFETCH(in);
+}
+
+template <typename T>
+KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in)
+{
+ KFR_PREFETCH(in);
+ KFR_PREFETCH(in + stride);
+ KFR_PREFETCH(in + stride * 2);
+ KFR_PREFETCH(in + stride * 3);
+}
+
+template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
+ bool inverse, bool aligned, typename T>
+KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>,
+ cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>,
+ cbool_t<aligned>, complex<T>* out, const complex<T>* in,
+ const complex<T>*& twiddle)
+{
+ constexpr static size_t prefetch_offset = width * 8;
+ const auto N4 = N / csize_t<4>();
+ const auto N43 = N4 * csize_t<3>();
+ CMT_ASSUME(blocks > 0);
+ CMT_ASSUME(N > 0);
+ CMT_ASSUME(N4 > 0);
+ DFT_ASSERT(width <= N4);
+ CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++)
+ {
+ CMT_PRAGMA_CLANG(clang loop unroll_count(2))
+ for (size_t n2 = 0; n2 < N4; n2 += width)
+ {
+ if (prefetch)
+ prefetch_four(N4, in + prefetch_offset);
+ radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
+ cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
+ in, twiddle + n2 * 3);
+ in += width;
+ out += width;
+ }
+ in += N43;
+ out += N43;
+ }
+ twiddle += N43;
+ return {};
+}
+
+template <bool splitin, size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
+KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cfalse_t, cbool_t<splitin>,
+ cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
+ complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+{
+ CMT_ASSUME(blocks > 0);
+ constexpr static size_t prefetch_offset = 32 * 4;
+ for (size_t b = 0; b < blocks; b++)
+ {
+ if (prefetch)
+ prefetch_four(csize_t<64>(), out + prefetch_offset);
+ cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
+ split(cread_split<8, aligned, splitin>(out + 0), w0, w1);
+ split(cread_split<8, aligned, splitin>(out + 8), w2, w3);
+ split(cread_split<8, aligned, splitin>(out + 16), w4, w5);
+ split(cread_split<8, aligned, splitin>(out + 24), w6, w7);
+
+ butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
+
+ w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
+ w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
+ w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
+ w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
+ w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
+ w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
+ w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
+
+ cvec<T, 8> z0, z1, z2, z3;
+ transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
+
+ butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
+ cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3)));
+ out += 32;
+ }
+ return {};
+}
+
+template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
+KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
+ cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
+ complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+{
+ CMT_ASSUME(blocks > 0);
+ DFT_ASSERT(2 <= blocks);
+ constexpr static size_t prefetch_offset = width * 16;
+ for (size_t b = 0; b < blocks; b += 2)
+ {
+ if (prefetch)
+ prefetch_one(out + prefetch_offset);
+
+ cvec<T, 8> vlo = cread<8, aligned>(out + 0);
+ cvec<T, 8> vhi = cread<8, aligned>(out + 8);
+ butterfly8<inverse>(vlo);
+ butterfly8<inverse>(vhi);
+ vlo = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vlo);
+ vhi = permutegroups<(2), 0, 4, 2, 6, 1, 5, 3, 7>(vhi);
+ cwrite<8, aligned>(out, vlo);
+ cwrite<8, aligned>(out + 8, vhi);
+ out += 16;
+ }
+ return {};
+}
+
+template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
+KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
+ cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
+ complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+{
+ CMT_ASSUME(blocks > 0);
+ constexpr static size_t prefetch_offset = width * 4;
+ DFT_ASSERT(2 <= blocks);
+ CMT_PRAGMA_CLANG(clang loop unroll_count(2))
+ for (size_t b = 0; b < blocks; b += 2)
+ {
+ if (prefetch)
+ prefetch_one(out + prefetch_offset);
+
+ cvec<T, 16> vlo = cread<16, aligned>(out);
+ cvec<T, 16> vhi = cread<16, aligned>(out + 16);
+ butterfly4<4, inverse>(vlo);
+ butterfly4<4, inverse>(vhi);
+ apply_twiddles4<0, 4, 4, inverse>(vlo);
+ apply_twiddles4<0, 4, 4, inverse>(vhi);
+ vlo = digitreverse4<2>(vlo);
+ vhi = digitreverse4<2>(vhi);
+ butterfly4<4, inverse>(vlo);
+ butterfly4<4, inverse>(vhi);
+
+ use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
+ use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
+ out += 32;
+ }
+ return {};
+}
+
+template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
+KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
+ cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
+ complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
+{
+ constexpr static size_t prefetch_offset = width * 4;
+ CMT_ASSUME(blocks > 0);
+ DFT_ASSERT(4 <= blocks);
+ CMT_LOOP_NOUNROLL
+ for (size_t b = 0; b < blocks; b += 4)
+ {
+ if (prefetch)
+ prefetch_one(out + prefetch_offset);
+
+ cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out);
+ butterfly4<4, inverse>(v16);
+ cdigitreverse4_write<aligned>(out, v16);
+
+ out += 4 * 4;
+ }
+ return {};
+}
+
+template <typename T, bool splitin, bool is_even>
+struct fft_stage_impl : dft_stage<T>
+{
+ fft_stage_impl(size_t stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->radix = 4;
+ this->stage_size = stage_size;
+ this->repeats = 4;
+ this->recursion = true;
+ this->data_size =
+ align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
+ }
+
+protected:
+ constexpr static bool prefetch = true;
+ constexpr static bool aligned = false;
+ constexpr static size_t width = fft_vector_width<T>;
+
+ virtual void do_initialize(size_t size) override final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddles<T, width>(twiddle, this->stage_size, size, true);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ if (splitin)
+ in = out;
+ const size_t stg_size = this->stage_size;
+ CMT_ASSUME(stg_size >= 2048);
+ CMT_ASSUME(stg_size % 2048 == 0);
+ radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<!is_even>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+ }
+};
+
+template <typename T, bool splitin, size_t size>
+struct fft_final_stage_impl : dft_stage<T>
+{
+ fft_final_stage_impl(size_t)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->radix = size;
+ this->stage_size = size;
+ this->out_offset = size;
+ this->repeats = 4;
+ this->recursion = true;
+ this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment);
+ }
+
+protected:
+ constexpr static size_t width = fft_vector_width<T>;
+ constexpr static bool is_even = cometa::is_even(ilog2(size));
+ constexpr static bool use_br2 = !is_even;
+ constexpr static bool aligned = false;
+ constexpr static bool prefetch = splitin;
+
+ KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cfalse_t, complex<T>*&) {}
+ KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cfalse_t, complex<T>*&) {}
+
+ template <size_t N, bool pass_splitin>
+ KFR_MEM_INTRINSIC void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>,
+ complex<T>*& twiddle)
+ {
+ constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width;
+ constexpr size_t pass_width = const_min(width, N / 4);
+ initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_split || pass_splitin);
+ init_twiddles(csize<N / 4>, total_size, cbool<pass_split>, twiddle);
+ }
+
+ virtual void do_initialize(size_t total_size) override final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
+ }
+
+ template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<32>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
+ const complex<T>*& twiddle)
+ {
+ radix4_pass(csize_t<32>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ }
+
+ template <bool inverse, typename U = T, KFR_ENABLE_IF(is_same<U, float>::value)>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
+ const complex<T>*& twiddle)
+ {
+ radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ }
+
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
+ const complex<T>*& twiddle)
+ {
+ radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ }
+
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cfalse_t, complex<T>* out, const complex<T>*,
+ const complex<T>*& twiddle)
+ {
+ radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ }
+
+ template <bool inverse, size_t N, bool pass_splitin>
+ KFR_MEM_INTRINSIC void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
+ const complex<T>* in, const complex<T>*& twiddle)
+ {
+ static_assert(N > 8, "");
+ constexpr bool pass_split = N / 4 > 8 && N / 4 / 4 >= width;
+ constexpr size_t pass_width = const_min(width, N / 4);
+ static_assert(pass_width == width || (pass_split == pass_splitin), "");
+ static_assert(pass_width <= N / 4, "");
+ radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_split>, cbool_t<pass_splitin>(),
+ cbool_t<use_br2>(), cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in,
+ twiddle);
+ final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_split>, out, out, twiddle);
+ }
+};
+
+template <typename T, bool is_even>
+struct fft_reorder_stage_impl : dft_stage<T>
+{
+ fft_reorder_stage_impl(size_t stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->stage_size = stage_size;
+ log2n = ilog2(stage_size);
+ this->data_size = 0;
+ }
+
+protected:
+ size_t log2n;
+
+ virtual void do_initialize(size_t) override final {}
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
+ {
+ fft_reorder(out, log2n, cbool_t<!is_even>());
+ }
+};
+
+template <typename T, size_t log2n>
+struct fft_specialization;
+
+template <typename T>
+struct fft_specialization<T, 1> : dft_stage<T>
+{
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
+
+protected:
+ constexpr static bool aligned = false;
+ DFT_STAGE_FN
+
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ cvec<T, 1> a0, a1;
+ split(cread<2, aligned>(in), a0, a1);
+ cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1));
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 2> : dft_stage<T>
+{
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
+
+protected:
+ constexpr static bool aligned = false;
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ cvec<T, 1> a0, a1, a2, a3;
+ split(cread<4>(in), a0, a1, a2, a3);
+ butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3);
+ cwrite<4>(out, concat(a0, a1, a2, a3));
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 3> : dft_stage<T>
+{
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
+
+protected:
+ constexpr static bool aligned = false;
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ cvec<T, 8> v8 = cread<8, aligned>(in);
+ butterfly8<inverse>(v8);
+ cwrite<8, aligned>(out, v8);
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 4> : dft_stage<T>
+{
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
+
+protected:
+ constexpr static bool aligned = false;
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ cvec<T, 16> v16 = cread<16, aligned>(in);
+ butterfly16<inverse>(v16);
+ cwrite<16, aligned>(out, v16);
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 5> : dft_stage<T>
+{
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
+
+protected:
+ constexpr static bool aligned = false;
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ cvec<T, 32> v32 = cread<32, aligned>(in);
+ butterfly32<inverse>(v32);
+ cwrite<32, aligned>(out, v32);
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 6> : dft_stage<T>
+{
+ fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
+
+protected:
+ constexpr static bool aligned = false;
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ butterfly64(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 7> : dft_stage<T>
+{
+ fft_specialization(size_t)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->stage_size = 128;
+ this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
+ }
+
+protected:
+ constexpr static bool aligned = false;
+ constexpr static size_t width = vector_width<T>;
+ constexpr static bool use_br2 = true;
+ constexpr static bool prefetch = false;
+ constexpr static bool is_double = sizeof(T) == 8;
+ constexpr static size_t final_size = is_double ? 8 : 32;
+ constexpr static size_t split_format = final_size == 8;
+
+ virtual void do_initialize(size_t total_size) override final
+ {
+ complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ initialize_twiddles<T, width>(twiddle, 128, total_size, split_format);
+ initialize_twiddles<T, width>(twiddle, 32, total_size, split_format);
+ initialize_twiddles<T, width>(twiddle, 8, total_size, split_format);
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
+ final_pass<inverse>(csize_t<final_size>(), out, in, twiddle);
+ if (this->need_reorder)
+ fft_reorder(out, csize_t<7>());
+ }
+
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void final_pass(csize_t<8>, complex<T>* out, const complex<T>* in,
+ const complex<T>* twiddle)
+ {
+ radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+ cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+ radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
+ cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ }
+
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void final_pass(csize_t<32>, complex<T>* out, const complex<T>* in,
+ const complex<T>* twiddle)
+ {
+ radix4_pass(128, 1, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
+ cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
+ radix4_pass(csize_t<32>(), 4, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
+ cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
+ }
+};
+
+template <>
+struct fft_specialization<float, 8> : dft_stage<float>
+{
+ fft_specialization(size_t)
+ {
+ this->name = type_name<decltype(*this)>();
+ this->temp_size = sizeof(complex<float>) * 256;
+ }
+
+protected:
+ using T = float;
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
+ {
+ complex<float>* scratch = ptr_cast<complex<float>>(temp);
+ if (out == in)
+ {
+ butterfly16_multi_flip<0, inverse>(scratch, out);
+ butterfly16_multi_flip<1, inverse>(scratch, out);
+ butterfly16_multi_flip<2, inverse>(scratch, out);
+ butterfly16_multi_flip<3, inverse>(scratch, out);
+
+ butterfly16_multi_natural<0, inverse>(out, scratch);
+ butterfly16_multi_natural<1, inverse>(out, scratch);
+ butterfly16_multi_natural<2, inverse>(out, scratch);
+ butterfly16_multi_natural<3, inverse>(out, scratch);
+ }
+ else
+ {
+ butterfly16_multi_flip<0, inverse>(out, in);
+ butterfly16_multi_flip<1, inverse>(out, in);
+ butterfly16_multi_flip<2, inverse>(out, in);
+ butterfly16_multi_flip<3, inverse>(out, in);
+
+ butterfly16_multi_natural<0, inverse>(out, out);
+ butterfly16_multi_natural<1, inverse>(out, out);
+ butterfly16_multi_natural<2, inverse>(out, out);
+ butterfly16_multi_natural<3, inverse>(out, out);
+ }
+ }
+};
+
+template <>
+struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
+{
+ using T = double;
+ fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
+ if (this->need_reorder)
+ fft_reorder(out, csize_t<8>());
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
+{
+ fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr);
+ if (this->need_reorder)
+ fft_reorder(out, csize_t<9>());
+ }
+};
+
+template <typename T>
+struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
+{
+ fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size)
+ {
+ this->name = type_name<decltype(*this)>();
+ }
+
+ DFT_STAGE_FN
+ template <bool inverse>
+ KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
+ {
+ fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr);
+ if (this->need_reorder)
+ fft_reorder(out, 10, cfalse);
+ }
+};
+
+} // namespace intrinsics
+
+template <typename T>
+template <bool is_even, bool first>
+void dft_plan<T>::make_fft(size_t stage_size, cbool_t<is_even>, cbool_t<first>)
+{
+ constexpr size_t final_size = is_even ? 1024 : 512;
+
+ if (stage_size >= 2048)
+ {
+ add_stage<intrinsics::fft_stage_impl<T, !first, is_even>>(stage_size);
+
+ make_fft(stage_size / 4, cbool_t<is_even>(), cfalse);
+ }
+ else
+ {
+ add_stage<intrinsics::fft_final_stage_impl<T, !first, final_size>>(final_size);
+ }
+}
+
+template <typename T>
+struct reverse_wrapper
+{
+ T& iterable;
+};
+
+template <typename T>
+auto begin(reverse_wrapper<T> w)
+{
+ return std::rbegin(w.iterable);
+}
+
+template <typename T>
+auto end(reverse_wrapper<T> w)
+{
+ return std::rend(w.iterable);
+}
+
+template <typename T>
+reverse_wrapper<T> reversed(T&& iterable)
+{
+ return { iterable };
+}
+
+template <typename T>
+void dft_plan<T>::initialize()
+{
+ data = autofree<u8>(data_size);
+ size_t offset = 0;
+ for (dft_stage_ptr& stage : stages)
+ {
+ stage->data = data.data() + offset;
+ stage->initialize(this->size);
+ offset += stage->data_size;
+ }
+
+ bool to_scratch = false;
+ bool scratch_needed = false;
+ for (dft_stage_ptr& stage : reversed(stages))
+ {
+ if (to_scratch)
+ {
+ scratch_needed = true;
+ }
+ stage->to_scratch = to_scratch;
+ if (!stage->can_inplace)
+ {
+ to_scratch = !to_scratch;
+ }
+ }
+ if (scratch_needed || !stages[0]->can_inplace)
+ this->temp_size += align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment);
+}
+
+template <typename T>
+const complex<T>* dft_plan<T>::select_in(size_t stage, const complex<T>* out, const complex<T>* in,
+ const complex<T>* scratch, bool in_scratch) const
+{
+ if (stage == 0)
+ return in_scratch ? scratch : in;
+ return stages[stage - 1]->to_scratch ? scratch : out;
+}
+
+template <typename T>
+complex<T>* dft_plan<T>::select_out(size_t stage, complex<T>* out, complex<T>* scratch) const
+{
+ return stages[stage]->to_scratch ? scratch : out;
+}
+
+template <typename T>
+template <bool inverse>
+void dft_plan<T>::execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
+{
+ if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
+ {
+ return stages[0]->execute(cbool<inverse>, out, in, temp);
+ }
+ size_t stack[32] = { 0 };
+
+ complex<T>* scratch =
+ ptr_cast<complex<T>>(temp + this->temp_size -
+ align_up(sizeof(complex<T>) * this->size, platform<>::native_cache_alignment));
+
+ bool in_scratch = !stages[0]->can_inplace && in == out;
+ if (in_scratch)
+ {
+ builtin_memcpy(scratch, in, sizeof(complex<T>) * this->size);
+ }
+
+ const size_t count = stages.size();
+
+ for (size_t depth = 0; depth < count;)
+ {
+ if (stages[depth]->recursion)
+ {
+ size_t offset = 0;
+ size_t rdepth = depth;
+ size_t maxdepth = depth;
+ do
+ {
+ if (stack[rdepth] == stages[rdepth]->repeats)
+ {
+ stack[rdepth] = 0;
+ rdepth--;
+ }
+ else
+ {
+ complex<T>* rout = select_out(rdepth, out, scratch);
+ const complex<T>* rin = select_in(rdepth, out, in, scratch, in_scratch);
+ stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
+ offset += stages[rdepth]->out_offset;
+ stack[rdepth]++;
+ if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
+ rdepth++;
+ else
+ maxdepth = rdepth;
+ }
+ } while (rdepth != depth);
+ depth = maxdepth + 1;
+ }
+ else
+ {
+ stages[depth]->execute(cbool<inverse>, select_out(depth, out, scratch),
+ select_in(depth, out, in, scratch, in_scratch), temp);
+ depth++;
+ }
+ }
+}
+
+template <typename T>
+void dft_plan<T>::init_fft(size_t size, dft_order)
+{
+ const size_t log2n = ilog2(size);
+ cswitch(csizes_t<1, 2, 3, 4, 5, 6, 7, 8, 9, 10>(), log2n,
+ [&](auto log2n) {
+ (void)log2n;
+ constexpr size_t log2nv = val_of(decltype(log2n)());
+ this->add_stage<intrinsics::fft_specialization<T, log2nv>>(size);
+ },
+ [&]() {
+ cswitch(cfalse_true, is_even(log2n), [&](auto is_even) {
+ this->make_fft(size, is_even, ctrue);
+ constexpr size_t is_evenv = val_of(decltype(is_even)());
+ if (need_reorder)
+ this->add_stage<intrinsics::fft_reorder_stage_impl<T, is_evenv>>(size);
+ });
+ });
+}
+
+template <typename T>
+dft_plan<T>::dft_plan(size_t size, dft_order order) : size(size), temp_size(0), data_size(0)
+{
+ need_reorder = true;
+ if (is_poweroftwo(size))
+ {
+ init_fft(size, order);
+ }
+#ifndef KFR_DFT_NO_NPo2
+ else
+ {
+ init_dft(size, order);
+ }
+#endif
+ initialize();
+}
+
+template <typename T>
+dft_plan_real<T>::dft_plan_real(size_t size) : dft_plan<T>(size / 2), size(size), rtwiddle(size / 4)
+{
+ using namespace intrinsics;
+
+ constexpr size_t width = vector_width<T> * 2;
+
+ block_process(size / 4, csizes_t<width, 1>(), [=](size_t i, auto w) {
+ constexpr size_t width = val_of(decltype(w)());
+ cwrite<width>(rtwiddle.data() + i,
+ cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + size / 4) / (size / 2)))));
+ });
+}
+
+template <typename T>
+void dft_plan_real<T>::to_fmt(complex<T>* out, dft_pack_format fmt) const
+{
+ using namespace intrinsics;
+ size_t csize = this->size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2
+
+ constexpr size_t width = vector_width<T> * 2;
+ const cvec<T, 1> dc = cread<1>(out);
+ const size_t count = csize / 2;
+
+ block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) {
+ i++;
+ constexpr size_t width = val_of(decltype(w)());
+ constexpr size_t widthm1 = width - 1;
+ const cvec<T, width> tw = cread<width>(rtwiddle.data() + i);
+ const cvec<T, width> fpk = cread<width>(out + i);
+ const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(out + csize - i - widthm1)));
+
+ const cvec<T, width> f1k = fpk + fpnk;
+ const cvec<T, width> f2k = fpk - fpnk;
+ const cvec<T, width> t = cmul(f2k, tw);
+ cwrite<width>(out + i, T(0.5) * (f1k + t));
+ cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t))));
+ });
+
+ {
+ size_t k = csize / 2;
+ const cvec<T, 1> fpk = cread<1>(out + k);
+ const cvec<T, 1> fpnk = negodd(fpk);
+ cwrite<1>(out + k, fpnk);
+ }
+ if (fmt == dft_pack_format::CCs)
+ {
+ cwrite<1>(out, pack(dc[0] + dc[1], 0));
+ cwrite<1>(out + csize, pack(dc[0] - dc[1], 0));
+ }
+ else
+ {
+ cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1]));
+ }
+}
+
+template <typename T>
+void dft_plan_real<T>::from_fmt(complex<T>* out, const complex<T>* in, dft_pack_format fmt) const
+{
+ using namespace intrinsics;
+
+ const size_t csize = this->size / 2;
+
+ cvec<T, 1> dc;
+
+ if (fmt == dft_pack_format::CCs)
+ {
+ dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real());
+ }
+ else
+ {
+ dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag());
+ }
+
+ constexpr size_t width = vector_width<T> * 2;
+ const size_t count = csize / 2;
+
+ block_process(count - 1, csizes_t<width, 1>(), [&](size_t i, auto w) {
+ i++;
+ constexpr size_t width = val_of(decltype(w)());
+ constexpr size_t widthm1 = width - 1;
+ const cvec<T, width> tw = cread<width>(rtwiddle.data() + i);
+ const cvec<T, width> fpk = cread<width>(in + i);
+ const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
+
+ const cvec<T, width> f1k = fpk + fpnk;
+ const cvec<T, width> f2k = fpk - fpnk;
+ const cvec<T, width> t = cmul_conj(f2k, tw);
+ cwrite<width>(out + i, f1k + t);
+ cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t)));
+ });
+
+ {
+ size_t k = csize / 2;
+ const cvec<T, 1> fpk = cread<1>(in + k);
+ const cvec<T, 1> fpnk = 2 * negodd(fpk);
+ cwrite<1>(out + k, fpnk);
+ }
+ cwrite<1>(out, dc);
+}
+
+template <typename T>
+dft_plan<T>::~dft_plan()
+{
+}
+
+template <typename T>
+void dft_plan<T>::dump() const
+{
+ for (const dft_stage_ptr& s : stages)
+ {
+ s->dump();
+ }
+}
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp
@@ -0,0 +1,50 @@
+/** @addtogroup dft
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+
+#ifdef FLOAT
+#include "../fft.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+template dft_plan<FLOAT>::dft_plan(size_t, dft_order);
+template void dft_plan<FLOAT>::init_fft(size_t, dft_order);
+template dft_plan<FLOAT>::~dft_plan();
+template void dft_plan<FLOAT>::dump() const;
+template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<false>, kfr::complex<FLOAT>* out,
+ const kfr::complex<FLOAT>* in, kfr::u8* temp) const;
+template void dft_plan<FLOAT>::execute_dft(cometa::cbool_t<true>, kfr::complex<FLOAT>* out,
+ const kfr::complex<FLOAT>* in, kfr::u8* temp) const;
+template dft_plan_real<FLOAT>::dft_plan_real(size_t);
+template void dft_plan_real<FLOAT>::from_fmt(kfr::complex<FLOAT>* out, const kfr::complex<FLOAT>* in,
+ kfr::dft_pack_format fmt) const;
+template void dft_plan_real<FLOAT>::to_fmt(kfr::complex<FLOAT>* out, kfr::dft_pack_format fmt) const;
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+#endif
diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp
@@ -25,40 +25,45 @@
*/
#pragma once
-#include "../../base/complex.hpp"
-#include "../../base/constants.hpp"
-#include "../../base/digitreverse.hpp"
-#include "../../base/read_write.hpp"
-#include "../../base/sin_cos.hpp"
#include "../../base/small_buffer.hpp"
#include "../../base/univector.hpp"
-#include "../../base/vec.hpp"
+#include "../../math/sin_cos.hpp"
+#include "../../simd/complex.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/digitreverse.hpp"
+#include "../../simd/read_write.hpp"
+#include "../../simd/vec.hpp"
#include "../../base/memory.hpp"
-#include "../../data/sincos.hpp"
+#include "../data/sincos.hpp"
CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4127))
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T, size_t N>
+using cvec = vec<T, N * 2>;
-namespace internal
+namespace intrinsics
{
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y)
{
return subadd(x * dupeven(y), swap<2>(x) * dupodd(y));
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y)
+KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y)
{
vec<T, N> yy = resize<N>(y);
return cmul_impl(x, yy);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y)
{
vec<T, N> xx = resize<N>(x);
return cmul_impl(xx, y);
@@ -66,24 +71,24 @@ CMT_INLINE vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y)
/// Complex Multiplication
template <typename T, size_t N1, size_t N2>
-CMT_INLINE vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y)
+KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y)
{
- return internal::cmul_impl(x, y);
+ return intrinsics::cmul_impl(x, y);
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y)
{
return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw)
+KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw)
{
return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw);
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1,
- const vec<T, N>& tw)
+KFR_INTRINSIC void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1,
+ const vec<T, N>& tw)
{
const vec<T, N> twr = dupeven(tw);
const vec<T, N> twi = dupodd(tw);
@@ -95,82 +100,79 @@ CMT_INLINE void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in
out1 += sumtw - diftw;
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y)
+KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y)
{
vec<T, N> yy = resize<N>(y);
return cmul_conj(x, yy);
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
-CMT_INLINE vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y)
+KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y)
{
vec<T, N> xx = resize<N>(x);
return cmul_conj(xx, y);
}
-template <typename T, size_t N>
-using cvec = vec<T, N * 2>;
-
template <size_t N, bool A = false, typename T>
-CMT_INLINE cvec<T, N> cread(const complex<T>* src)
+KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src)
{
return cvec<T, N>(ptr_cast<T>(src), cbool_t<A>());
}
template <size_t N, bool A = false, typename T>
-CMT_INLINE void cwrite(complex<T>* dest, const cvec<T, N>& value)
+KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value)
{
value.write(ptr_cast<T>(dest));
}
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
-CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>)
+KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>)
{
return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
}
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
-CMT_INLINE void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>)
+KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>)
{
swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... };
}
template <size_t count, size_t N, bool A, typename T, size_t... indices>
-CMT_INLINE cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>)
+KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>)
{
return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
}
template <size_t count, size_t N, bool A, typename T, size_t... indices>
-CMT_INLINE void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value,
- csizes_t<indices...>)
+KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value,
+ csizes_t<indices...>)
{
swallow{ (write<A>(ptr_cast<T>(dest + stride * indices), slice<indices * N * 2, N * 2>(value)), 0)... };
}
template <size_t count, size_t N, size_t stride, bool A = false, typename T>
-CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src)
+KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src)
{
return cread_group_impl<count, N, stride, A>(src, csizeseq_t<count>());
}
template <size_t count, size_t N, size_t stride, bool A = false, typename T>
-CMT_INLINE void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value)
+KFR_INTRINSIC void cwrite_group(complex<T>* dest, const cvec<T, count * N>& value)
{
return cwrite_group_impl<count, N, stride, A>(dest, value, csizeseq_t<count>());
}
template <size_t count, size_t N, bool A = false, typename T>
-CMT_INLINE cvec<T, count * N> cread_group(const complex<T>* src, size_t stride)
+KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride)
{
return cread_group_impl<count, N, A>(src, stride, csizeseq_t<count>());
}
template <size_t count, size_t N, bool A = false, typename T>
-CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value)
+KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value)
{
return cwrite_group_impl<count, N, A>(dest, stride, value, csizeseq_t<count>());
}
template <size_t N, bool A = false, bool split = false, typename T>
-CMT_INLINE cvec<T, N> cread_split(const complex<T>* src)
+KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src)
{
cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>());
if (split)
@@ -179,7 +181,7 @@ CMT_INLINE cvec<T, N> cread_split(const complex<T>* src)
}
template <size_t N, bool A = false, bool split = false, typename T>
-CMT_INLINE void cwrite_split(complex<T>* dest, const cvec<T, N>& value)
+KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value)
{
cvec<T, N> v = value;
if (split)
@@ -262,13 +264,13 @@ inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64,
}
template <size_t N, size_t stride, typename T, size_t... Indices>
-CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>)
+KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>)
{
return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...);
}
template <size_t N, size_t stride, typename T>
-CMT_INLINE cvec<T, N> cgather(const complex<T>* base)
+KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base)
{
if (stride == 1)
{
@@ -278,7 +280,7 @@ CMT_INLINE cvec<T, N> cgather(const complex<T>* base)
return cgather_helper<N, stride, T>(base, csizeseq_t<N>());
}
-CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t)
+KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t)
{
size_t temp = index;
index += stride;
@@ -286,7 +288,7 @@ CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t size, size_t
index -= size;
return temp;
}
-CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t)
+KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t)
{
size_t temp = index;
index += stride;
@@ -294,45 +296,45 @@ CMT_INLINE size_t cgather_next(size_t& index, size_t stride, size_t)
}
template <size_t N, typename T, size_t... Indices>
-CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride,
- csizes_t<Indices...>)
+KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride,
+ csizes_t<Indices...>)
{
return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...);
}
template <size_t N, typename T>
-CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride)
+KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride)
{
return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>());
}
template <size_t N, typename T>
-CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t stride)
+KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride)
{
size_t index = 0;
return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>());
}
template <size_t N, typename T, size_t... Indices>
-CMT_INLINE cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size,
- csizes_t<Indices...>)
+KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size,
+ csizes_t<Indices...>)
{
return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...);
}
template <size_t N, typename T>
-CMT_INLINE cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size)
+KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size)
{
return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>());
}
template <size_t N, size_t stride, typename T, size_t... Indices>
-CMT_INLINE void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>)
+KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>)
{
swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... };
}
template <size_t N, size_t stride, typename T>
-CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value)
+KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value)
{
if (stride == 1)
{
@@ -345,34 +347,35 @@ CMT_INLINE void cscatter(complex<T>* base, const cvec<T, N>& value)
}
template <size_t N, typename T, size_t... Indices>
-CMT_INLINE void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value,
- csizes_t<Indices...>)
+KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value,
+ csizes_t<Indices...>)
{
swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... };
}
template <size_t N, typename T>
-CMT_INLINE void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value)
+KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value)
{
return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>());
}
template <size_t groupsize = 1, typename T, size_t N, typename IT>
-CMT_INLINE vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset)
+KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset)
{
return gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>());
}
template <size_t groupsize = 1, typename T, size_t N, typename IT>
-CMT_INLINE void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value)
+KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value)
{
return scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>());
}
template <typename T>
-KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2,
- const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2,
- cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, cvec<T, 4>& w7)
+KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2,
+ const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2,
+ cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6,
+ cvec<T, 4>& w7)
{
cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3));
cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3));
@@ -389,10 +392,10 @@ KFR_INTRIN void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const c
}
template <typename T>
-KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2,
- const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5,
- const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1,
- cvec<T, 8>& z2, cvec<T, 8>& z3)
+KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2,
+ const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5,
+ const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1,
+ cvec<T, 8>& z2, cvec<T, 8>& z3)
{
cvec<T, 16> a = concat(w0, w1, w2, w3);
cvec<T, 16> b = concat(w4, w5, w6, w7);
@@ -405,7 +408,7 @@ KFR_INTRIN void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const c
}
template <typename T>
-void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d)
+KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d)
{
cvec<T, 4> a0, a1, a2, a3;
cvec<T, 4> b0, b1, b2, b3;
@@ -423,8 +426,8 @@ void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d)
d = concat(a3, b3, c3, d3);
}
template <typename T>
-void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa,
- cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd)
+KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa,
+ cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd)
{
cvec<T, 4> a0, a1, a2, a3;
cvec<T, 4> b0, b1, b2, b3;
@@ -443,35 +446,35 @@ void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d,
}
template <bool b, typename T>
-constexpr KFR_INTRIN T chsign(T x)
+constexpr KFR_INTRINSIC T chsign(T x)
{
return b ? -x : x;
}
template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false,
size_t... indices>
-constexpr KFR_INTRIN cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>)
+constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>)
{
return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start)))
: cos_using_table<T>(size, (indices / 2 * step + start)))...);
}
template <typename T, size_t width, size_t... indices>
-constexpr KFR_INTRIN cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, size_t start,
- size_t step)
+constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size,
+ size_t start, size_t step)
{
return make_vector((indices & 1 ? -sin_using_table<T>(size, indices / 2 * step + start)
: cos_using_table<T>(size, indices / 2 * step + start))...);
}
template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false>
-constexpr KFR_INTRIN cvec<T, width> fixed_twiddle()
+constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle()
{
return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>());
}
template <typename T, size_t width>
-constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0)
+constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0)
{
return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size);
}
@@ -480,7 +483,7 @@ constexpr KFR_INTRIN cvec<T, width> fixed_twiddle(size_t size, size_t start, siz
// constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>();
template <typename T, size_t N, bool inverse>
-constexpr cvec<T, N> twiddleimagmask()
+constexpr KFR_INTRINSIC cvec<T, N> twiddleimagmask()
{
return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1);
}
@@ -498,7 +501,7 @@ CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x)
template <size_t k, size_t size, bool inverse = false, typename T, size_t width,
size_t kk = (inverse ? size - k : k) % size>
-KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
+KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
{
constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485);
if (kk == 0)
@@ -540,7 +543,7 @@ KFR_INTRIN vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
}
template <size_t N, typename T>
-KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1)
+KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1)
{
const cvec<T, N> sum = a0 + a1;
const cvec<T, N> dif = a0 - a1;
@@ -549,15 +552,15 @@ KFR_INTRIN void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N
}
template <size_t N, typename T>
-KFR_INTRIN void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1)
+KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1)
{
butterfly2<N>(a0, a1, a0, a1);
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
- const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
- cvec<T, N>& w2, cvec<T, N>& w3)
+KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
+ const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
+ cvec<T, N>& w2, cvec<T, N>& w3)
{
cvec<T, N> sum02, sum13, diff02, diff13;
cvec<T, N * 2> a01, a23, sum0213, diff0213;
@@ -589,9 +592,9 @@ KFR_INTRIN void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, cons
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
- const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
- cvec<T, N>& w2, cvec<T, N>& w3)
+KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
+ const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
+ cvec<T, N>& w2, cvec<T, N>& w3)
{
vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3;
vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3;
@@ -616,11 +619,11 @@ KFR_INTRIN void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
- const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
- const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1,
- cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6,
- cvec<T, N>& w7)
+KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+ const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+ const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1,
+ cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6,
+ cvec<T, N>& w7)
{
cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6;
butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6);
@@ -642,14 +645,14 @@ KFR_INTRIN void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
- cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7)
+KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
+ cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7)
{
butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7);
}
template <bool inverse = false, typename T>
-KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67)
+KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67)
{
cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67;
@@ -670,7 +673,7 @@ KFR_INTRIN void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cv
}
template <bool inverse = false, typename T>
-KFR_INTRIN void butterfly8(cvec<T, 8>& v8)
+KFR_INTRINSIC void butterfly8(cvec<T, 8>& v8)
{
cvec<T, 2> w0, w1, w2, w3;
split(v8, w0, w1, w2, w3);
@@ -679,7 +682,7 @@ KFR_INTRIN void butterfly8(cvec<T, 8>& v8)
}
template <bool inverse = false, typename T>
-KFR_INTRIN void butterfly32(cvec<T, 32>& v32)
+KFR_INTRINSIC void butterfly32(cvec<T, 32>& v32)
{
cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
split(v32, w0, w1, w2, w3, w4, w5, w6, w7);
@@ -701,7 +704,7 @@ KFR_INTRIN void butterfly32(cvec<T, 32>& v32)
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly4(cvec<T, N * 4>& a0123)
+KFR_INTRINSIC void butterfly4(cvec<T, N * 4>& a0123)
{
cvec<T, N> a0;
cvec<T, N> a1;
@@ -713,7 +716,7 @@ KFR_INTRIN void butterfly4(cvec<T, N * 4>& a0123)
}
template <size_t N, typename T>
-KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01)
+KFR_INTRINSIC void butterfly2(cvec<T, N * 2>& a01)
{
cvec<T, N> a0;
cvec<T, N> a1;
@@ -723,7 +726,7 @@ KFR_INTRIN void butterfly2(cvec<T, N * 2>& a01)
}
template <size_t N, bool inverse = false, bool split_format = false, typename T>
-KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1)
+KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1)
{
if (split_format)
{
@@ -750,9 +753,9 @@ KFR_INTRIN void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<
}
template <size_t N, bool inverse = false, bool split_format = false, typename T>
-KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3,
- const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3,
- cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
+KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3,
+ const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3,
+ cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
{
apply_twiddle<N, inverse, split_format>(a1, tw1, w1);
apply_twiddle<N, inverse, split_format>(a2, tw2, w2);
@@ -760,31 +763,31 @@ KFR_INTRIN void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, cons
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
- cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2,
- const cvec<T, N>& tw3)
+KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
+ cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2,
+ const cvec<T, N>& tw3)
{
apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3);
}
template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]>
-KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
- cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2,
- const cvec<T, 1>& tw3)
+KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
+ cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2,
+ const cvec<T, 1>& tw3)
{
apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
}
template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]>
-KFR_INTRIN void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
- cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2,
- cvec<T, N / 2> tw3)
+KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
+ cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2,
+ cvec<T, N / 2> tw3)
{
apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d)
+KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d)
{
cvec<T, 4> b0, b1, b2, b3;
cvec<T, 4> c0, c1, c2, c3;
@@ -812,7 +815,7 @@ KFR_INTRIN void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, c
}
template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T>
-KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
+KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
{
cvec<T, N> a0;
cvec<T, N> a1;
@@ -830,7 +833,7 @@ KFR_INTRIN void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
}
template <bool inverse, bool aligned, typename T>
-KFR_INTRIN void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in)
+KFR_INTRINSIC void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in)
{
cvec<T, 16> w0, w1, w2, w3;
@@ -886,7 +889,7 @@ KFR_INTRIN void butterfly64(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
}
template <bool inverse = false, typename T>
-KFR_INTRIN void butterfly16(cvec<T, 16>& v16)
+KFR_INTRINSIC void butterfly16(cvec<T, 16>& v16)
{
butterfly4<4, inverse>(v16);
apply_twiddles4<0, 4, 4, inverse>(v16);
@@ -895,7 +898,7 @@ KFR_INTRIN void butterfly16(cvec<T, 16>& v16)
}
template <size_t index, bool inverse = false, typename T>
-KFR_INTRIN void butterfly16_multi_natural(complex<T>* out, const complex<T>* in)
+KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in)
{
constexpr size_t N = 4;
@@ -954,7 +957,7 @@ KFR_INTRIN void butterfly16_multi_natural(complex<T>* out, const complex<T>* in)
}
template <size_t index, bool inverse = false, typename T>
-KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in)
+KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in)
{
constexpr size_t N = 4;
@@ -1011,7 +1014,7 @@ KFR_INTRIN void butterfly16_multi_flip(complex<T>* out, const complex<T>* in)
}
template <size_t n2, size_t nnstep, size_t N, typename T>
-KFR_INTRIN void apply_twiddles2(cvec<T, N>& a1)
+KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1)
{
cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>();
@@ -1026,8 +1029,8 @@ static const cvec<T, N> tw3i1 =
static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, cvec<T, N>& w01,
- cvec<T, N>& w02)
+KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00,
+ cvec<T, N>& w01, cvec<T, N>& w02)
{
const cvec<T, N> sum1 = a01 + a02;
@@ -1043,15 +1046,16 @@ KFR_INTRIN void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2)
+KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2)
{
butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2);
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
- const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, cvec<T, N>& w0,
- cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5)
+KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+ const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+ cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
+ cvec<T, N>& w5)
{
cvec<T, N* 2> a03 = concat(a0, a3);
cvec<T, N* 2> a25 = concat(a2, a5);
@@ -1073,8 +1077,8 @@ KFR_INTRIN void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
- cvec<T, N>& a5)
+KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
+ cvec<T, N>& a5)
{
butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5);
}
@@ -1090,11 +1094,11 @@ const static cvec<T, 1> tw9_4 = { T(-0.93969262078590838405410927732473),
(inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) };
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
- const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
- const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, cvec<T, N>& w0,
- cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5,
- cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8)
+KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+ const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+ const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
+ cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
+ cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8)
{
cvec<T, N* 3> a012 = concat(a0, a1, a2);
cvec<T, N* 3> a345 = concat(a3, a4, a5);
@@ -1121,8 +1125,8 @@ KFR_INTRIN void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cve
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
- cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8)
+KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
+ cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8)
{
butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8);
}
@@ -1149,9 +1153,10 @@ static const cvec<T, N> tw7i3 =
static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
- cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
- cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06)
+KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
+ cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01,
+ cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05,
+ cvec<T, N>& w06)
{
const cvec<T, N> sum1 = a01 + a06;
const cvec<T, N> dif1 = swap<2>(a01 - a06);
@@ -1184,8 +1189,8 @@ KFR_INTRIN void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
- cvec<T, N>& a5, cvec<T, N>& a6)
+KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
+ cvec<T, N>& a5, cvec<T, N>& a6)
{
butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6);
}
@@ -1226,11 +1231,11 @@ static const cvec<T, N> tw11i5 =
static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>();
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
- cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09,
- cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
- cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06,
- cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10)
+KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
+ cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09,
+ cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
+ cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06,
+ cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10)
{
const cvec<T, N> sum1 = a01 + a10;
const cvec<T, N> dif1 = swap<2>(a01 - a10);
@@ -1300,9 +1305,9 @@ const static cvec<T, N> tw5i2 =
static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
- const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
- cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
+KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
+ const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
+ cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
{
const cvec<T, N> sum1 = a01 + a04;
const cvec<T, N> dif1 = swap<2>(a01 - a04);
@@ -1323,12 +1328,12 @@ KFR_INTRIN void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const c
}
template <size_t N, bool inverse = false, typename T>
-KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
- const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
- const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
- const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2,
- cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7,
- cvec<T, N>& w8, cvec<T, N>& w9)
+KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
+ const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
+ const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
+ const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2,
+ cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7,
+ cvec<T, N>& w8, cvec<T, N>& w9)
{
cvec<T, N* 2> a05 = concat(a0, a5);
cvec<T, N* 2> a27 = concat(a2, a7);
@@ -1363,91 +1368,96 @@ KFR_INTRIN void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cv
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0,
- vec<T, N>& out1)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0,
+ vec<T, N>& out1)
{
butterfly2<N / 2>(in0, in1, out0, out1);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2)
{
butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
- vec<T, N>& out3)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1,
+ vec<T, N>& out2, vec<T, N>& out3)
{
butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, vec<T, N>& out0, vec<T, N>& out1,
- vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
+ vec<T, N>& out4)
{
butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5, vec<T, N>& out0,
- vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
+ vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5)
{
butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
- const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
- vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1,
+ vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
+ vec<T, N>& out6)
{
butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
- const vec<T, N>& in6, const vec<T, N>& in7, vec<T, N>& out0, vec<T, N>& out1,
- vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
- vec<T, N>& out7)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
+ vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
+ vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7)
{
butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5,
out6, out7);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
- const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8, vec<T, N>& out0,
- vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
- vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
+ const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
+ vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
+ vec<T, N>& out7, vec<T, N>& out8)
{
butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4,
out5, out6, out7, out8);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
- const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
- const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
- vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
- vec<T, N>& out8, vec<T, N>& out9)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
+ const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1,
+ vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
+ vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9)
{
butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3,
out4, out5, out6, out7, out8, out9);
}
template <bool inverse, typename T, size_t N>
-KFR_INTRIN void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& in2,
- const vec<T, N>& in3, const vec<T, N>& in4, const vec<T, N>& in5,
- const vec<T, N>& in6, const vec<T, N>& in7, const vec<T, N>& in8,
- const vec<T, N>& in9, const vec<T, N>& in10, vec<T, N>& out0, vec<T, N>& out1,
- vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
- vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10)
+KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
+ const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
+ const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
+ const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10,
+ vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
+ vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
+ vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10)
{
butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2,
out3, out4, out5, out6, out7, out8, out9, out10);
}
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
-KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
+KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
{
vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr));
if (transposed)
@@ -1456,8 +1466,8 @@ KFR_INTRIN void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec
}
// Warning: Reads past the end. Use with care
-KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, cvec<f32, 4>& w1,
- cvec<f32, 4>& w2)
+KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0,
+ cvec<f32, 4>& w1, cvec<f32, 4>& w2)
{
cvec<f32, 4> w3;
cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9));
@@ -1465,8 +1475,8 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3
split(v16, w0, w1, w2, w3);
}
-KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, cvec<f32, 4>& w1,
- cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4)
+KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0,
+ cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4)
{
cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15));
v16 = digitreverse4<2>(v16);
@@ -1475,7 +1485,7 @@ KFR_INTRIN void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f3
}
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
-KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args)
+KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args)
{
auto temp = concat(args...);
if (transposed)
@@ -1484,20 +1494,21 @@ KFR_INTRIN void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N
}
template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
-KFR_INTRIN vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle)
+KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle)
{
return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1)));
}
template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
-KFR_INTRIN vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle)
+KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle)
{
return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1)));
}
// Non-final
template <typename T, size_t width, size_t radix, bool inverse, size_t... I>
-KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>,
- complex<T>* out, const complex<T>* in, const complex<T>* tw, size_t stride)
+KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>,
+ cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+ const complex<T>* tw, size_t stride)
{
carray<cvec<T, width>, radix> inout;
@@ -1513,8 +1524,8 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize
// Final
template <typename T, size_t width, size_t radix, bool inverse, size_t... I>
-KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, cbool_t<inverse>,
- complex<T>* out, const complex<T>* in, size_t stride)
+KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>,
+ cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride)
{
carray<cvec<T, width>, radix> inout;
@@ -1527,17 +1538,17 @@ KFR_INTRIN void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize
}
template <size_t width, size_t radix, typename... Args>
-KFR_INTRIN void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args)
+KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args)
{
butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...);
}
template <typename... Args>
-KFR_INTRIN void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...)
+KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...)
{
}
template <size_t width, typename... Args>
-KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args)
+KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args)
{
CMT_LOOP_NOUNROLL
for (; i < count / width * width; i += width)
@@ -1546,7 +1557,7 @@ KFR_INTRIN void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&.
}
template <size_t width, typename... Args>
-KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args)
+KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&... args)
{
CMT_ASSUME(count > 0);
size_t i = 0;
@@ -1554,16 +1565,17 @@ KFR_INTRIN void butterflies(size_t count, csize_t<width>, Args&&... args)
}
template <typename T, bool inverse, typename Tradix, typename Tstride>
-KFR_INTRIN void generic_butterfly_cycle(csize_t<0>, Tradix radix, cbool_t<inverse>, complex<T>*,
- const complex<T>*, Tstride, size_t, size_t, const complex<T>*, size_t)
+KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*,
+ const complex<T>*, Tstride, size_t, size_t, const complex<T>*,
+ size_t)
{
}
template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix,
typename Thalfradixsqr, typename Tstride>
-KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out,
- const complex<T>* in, Tstride ostride, Thalfradix halfradix,
- Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i)
+KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out,
+ const complex<T>* in, Tstride ostride, Thalfradix halfradix,
+ Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i)
{
CMT_LOOP_NOUNROLL
for (; i < halfradix / width * width; i += width)
@@ -1605,19 +1617,19 @@ KFR_INTRIN void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<in
}
template <typename T>
-KFR_SINTRIN vec<T, 2> hcadd(vec<T, 2> value)
+KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)>
-KFR_SINTRIN vec<T, 2> hcadd(vec<T, N> value)
+KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value)
{
return hcadd(low(value) + high(value));
}
template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>>
-KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
- const complex<T>* twiddle, Tstride ostride = Tstride{})
+KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+ const complex<T>* twiddle, Tstride ostride = Tstride{})
{
CMT_ASSUME(radix > 0);
{
@@ -1636,8 +1648,7 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>*
}
cwrite<1>(out, hcadd(sum) + sums);
}
- const auto halfradix = radix / 2;
- const auto halfradix_sqr = halfradix * halfradix;
+ const auto halfradix = radix / 2;
CMT_ASSUME(halfradix > 0);
size_t i = 0;
@@ -1646,9 +1657,9 @@ KFR_INTRIN void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>*
}
template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>>
-KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out,
- const complex<T>* in, const complex<T>* twiddle,
- Tstride ostride = Tstride{})
+KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out,
+ const complex<T>* in, const complex<T>* twiddle,
+ Tstride ostride = Tstride{})
{
{
cvec<T, width> sum = T();
@@ -1676,16 +1687,16 @@ KFR_INTRIN void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, compl
}
template <typename T, bool inverse, typename Tstride = csize_t<1>>
-KFR_INTRIN void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
- complex<T>* temp, const complex<T>* twiddle, Tstride ostride = {})
+KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+ complex<T>*, const complex<T>* twiddle, Tstride ostride = {})
{
cswitch(csizes_t<11, 13>(), radix,
[&](auto radix_) CMT_INLINE_LAMBDA {
- constexpr size_t width = platform<T>::vector_width;
+ constexpr size_t width = vector_width<T>;
spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride);
},
[&]() CMT_INLINE_LAMBDA {
- constexpr size_t width = platform<T>::vector_width;
+ constexpr size_t width = vector_width<T>;
generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
});
}
@@ -1697,25 +1708,25 @@ template <typename T, size_t N>
constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T());
template <bool A = false, typename T, size_t N>
-KFR_INTRIN void cbitreverse_write(complex<T>* dest, const vec<T, N>& x)
+KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x)
{
cwrite<N / 2, A>(dest, bitreverse<2>(x));
}
template <bool A = false, typename T, size_t N>
-KFR_INTRIN void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x)
+KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x)
{
cwrite<N / 2, A>(dest, digitreverse4<2>(x));
}
template <size_t N, bool A = false, typename T>
-KFR_INTRIN cvec<T, N> cbitreverse_read(const complex<T>* src)
+KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src)
{
return bitreverse<2>(cread<N, A>(src));
}
template <size_t N, bool A = false, typename T>
-KFR_INTRIN cvec<T, N> cdigitreverse4_read(const complex<T>* src)
+KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src)
{
return digitreverse4<2>(cread<N, A>(src));
}
@@ -1723,7 +1734,7 @@ KFR_INTRIN cvec<T, N> cdigitreverse4_read(const complex<T>* src)
#if 1
template <>
-KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src)
+KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src)
{
return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12),
cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13),
@@ -1731,7 +1742,7 @@ KFR_INTRIN cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>*
cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15));
}
template <>
-KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x)
+KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x)
{
cwrite<1>(dest, part<16, 0>(x));
cwrite<1>(dest + 4, part<16, 1>(x));
@@ -1754,7 +1765,8 @@ KFR_INTRIN void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const v
cwrite<1>(dest + 15, part<16, 15>(x));
}
#endif
-} // namespace internal
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/dft/reference_dft.hpp b/include/kfr/dft/reference_dft.hpp
@@ -25,13 +25,13 @@
*/
#pragma once
-#include "../base/complex.hpp"
-#include "../base/constants.hpp"
#include "../base/memory.hpp"
-#include "../base/read_write.hpp"
#include "../base/small_buffer.hpp"
#include "../base/univector.hpp"
-#include "../base/vec.hpp"
+#include "../simd/complex.hpp"
+#include "../simd/constants.hpp"
+#include "../simd/read_write.hpp"
+#include "../simd/vec.hpp"
#include <cmath>
#include <vector>
diff --git a/include/kfr/dsp.hpp b/include/kfr/dsp.hpp
@@ -33,7 +33,6 @@
#include "dsp/fir_design.hpp"
#include "dsp/fracdelay.hpp"
#include "dsp/goertzel.hpp"
-#include "dsp/interpolation.hpp"
#include "dsp/mixdown.hpp"
#include "dsp/oscillators.hpp"
#include "dsp/sample_rate_conversion.hpp"
diff --git a/include/kfr/dsp/biquad.hpp b/include/kfr/dsp/biquad.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup biquad
* @{
*/
/*
@@ -26,13 +26,16 @@
#pragma once
#include "../base/filter.hpp"
-#include "../base/function.hpp"
-#include "../base/operators.hpp"
#include "../base/pointer.hpp"
-#include "../base/vec.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/vec.hpp"
+#include "../testo/assert.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
enum class biquad_type
{
@@ -53,17 +56,24 @@ template <typename T>
struct biquad_params
{
template <typename U>
- constexpr biquad_params(const biquad_params<U>& bq) noexcept
- : a0(static_cast<T>(bq.a0)), a1(static_cast<T>(bq.a1)), a2(static_cast<T>(bq.a2)),
- b0(static_cast<T>(bq.b0)), b1(static_cast<T>(bq.b1)), b2(static_cast<T>(bq.b2))
+ constexpr biquad_params(const biquad_params<U>& bq) CMT_NOEXCEPT : a0(static_cast<T>(bq.a0)),
+ a1(static_cast<T>(bq.a1)),
+ a2(static_cast<T>(bq.a2)),
+ b0(static_cast<T>(bq.b0)),
+ b1(static_cast<T>(bq.b1)),
+ b2(static_cast<T>(bq.b2))
{
}
constexpr static bool is_pod = true;
static_assert(std::is_floating_point<T>::value, "T must be a floating point type");
- constexpr biquad_params() noexcept : a0(1), a1(0), a2(0), b0(1), b1(0), b2(0) {}
- constexpr biquad_params(T a0, T a1, T a2, T b0, T b1, T b2) noexcept
- : a0(a0), a1(a1), a2(a2), b0(b0), b1(b1), b2(b2)
+ constexpr biquad_params() CMT_NOEXCEPT : a0(1), a1(0), a2(0), b0(1), b1(0), b2(0) {}
+ constexpr biquad_params(T a0, T a1, T a2, T b0, T b1, T b2) CMT_NOEXCEPT : a0(a0),
+ a1(a1),
+ a2(a2),
+ b0(b0),
+ b1(b1),
+ b2(b2)
{
}
T a0;
@@ -90,7 +100,7 @@ struct biquad_state
vec<T, filters> s1;
vec<T, filters> s2;
vec<T, filters> out;
- constexpr biquad_state() noexcept : s1(0), s2(0), out(0) {}
+ constexpr biquad_state() CMT_NOEXCEPT : s1(0), s2(0), out(0) {}
};
template <typename T, size_t filters, KFR_ARCH_DEP>
@@ -102,8 +112,8 @@ struct biquad_block
vec<T, filters> b1;
vec<T, filters> b2;
- constexpr biquad_block() noexcept : a1(0), a2(0), b0(1), b1(0), b2(0) {}
- CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) noexcept
+ constexpr biquad_block() CMT_NOEXCEPT : a1(0), a2(0), b0(1), b1(0), b2(0) {}
+ CMT_GNU_CONSTEXPR biquad_block(const biquad_params<T>* bq, size_t count) CMT_NOEXCEPT
{
count = count > filters ? filters : count;
for (size_t i = 0; i < count; i++)
@@ -125,38 +135,40 @@ struct biquad_block
}
template <size_t count>
- constexpr biquad_block(const biquad_params<T> (&bq)[count]) noexcept : biquad_block(bq, count)
+ constexpr biquad_block(const biquad_params<T> (&bq)[count]) CMT_NOEXCEPT : biquad_block(bq, count)
{
static_assert(count <= filters, "count > filters");
}
};
template <size_t filters, typename T, typename E1, KFR_ARCH_DEP>
-struct expression_biquads_l : public expression_base<E1>
+struct expression_biquads_l : public expression_with_arguments<E1>
{
using value_type = T;
expression_biquads_l(const biquad_block<T, filters>& bq, E1&& e1)
- : expression_base<E1>(std::forward<E1>(e1)), bq(bq)
+ : expression_with_arguments<E1>(std::forward<E1>(e1)), bq(bq)
{
}
template <size_t width>
- KFR_INTRIN vec<T, width> operator()(cinput_t cinput, size_t index, vec_t<T, width> t) const
+ friend KFR_INTRINSIC vec<T, width> get_elements(const expression_biquads_l& self, cinput_t cinput, size_t index,
+ vec_shape<T, width> t)
{
- const vec<T, width> in = this->argument_first(cinput, index, t);
+ const vec<T, width> in = self.argument_first(cinput, index, t);
vec<T, width> out;
CMT_LOOP_UNROLL
for (size_t i = 0; i < width; i++)
{
- state.out = process(bq, state, insertleft(in[i], state.out));
- out[i] = state.out[filters - 1];
+ self.state.out = process(self.bq, self.state, insertleft(in[i], self.state.out));
+ out[i] = self.state.out[filters - 1];
}
return out;
}
- KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state,
- const vec<T, filters>& in)
+ static KFR_MEM_INTRINSIC vec<T, filters> process(const biquad_block<T, filters>& bq,
+ biquad_state<T, filters>& state,
+ const vec<T, filters>& in)
{
const vec<T, filters> out = bq.b0 * in + state.s1;
state.s1 = state.s2 + bq.b1 * in - bq.a1 * out;
@@ -168,73 +180,74 @@ struct expression_biquads_l : public expression_base<E1>
};
template <size_t filters, typename T, typename E1, KFR_ARCH_DEP>
-struct expression_biquads : expression_base<E1>
+struct expression_biquads : expression_with_arguments<E1>
{
using value_type = T;
expression_biquads(const biquad_block<T, filters>& bq, E1&& e1)
- : expression_base<E1>(std::forward<E1>(e1)), bq(bq), block_end(0)
+ : expression_with_arguments<E1>(std::forward<E1>(e1)), bq(bq), block_end(0)
{
}
- CMT_INLINE void begin_block(cinput_t cinput, size_t size) const
+ void begin_block(cinput_t cinput, size_t size) const
{
block_end = size;
for (size_t i = 0; i < filters - 1; i++)
{
- const vec<T, 1> in = i < size ? this->argument_first(cinput, i, vec_t<T, 1>()) : 0;
+ const vec<T, 1> in = i < size ? this->argument_first(cinput, i, vec_shape<T, 1>()) : 0;
state.out = process(bq, state, insertleft(in[0], state.out));
}
}
- CMT_INLINE void end_block(cinput_t cinput, size_t) const { state = saved_state; }
+ void end_block(cinput_t, size_t) const { state = saved_state; }
template <size_t width>
- KFR_INTRIN vec<T, width> operator()(cinput_t cinput, size_t index, vec_t<T, width> t) const
+ friend KFR_INTRINSIC vec<T, width> get_elements(const expression_biquads& self, cinput_t cinput, size_t index,
+ vec_shape<T, width> t)
{
index += filters - 1;
- vec<T, width> out;
- if (index + width <= block_end)
+ vec<T, width> out{};
+ if (index + width <= self.block_end)
{
- const vec<T, width> in = this->argument_first(cinput, index, t);
+ const vec<T, width> in = self.argument_first(cinput, index, t);
CMT_LOOP_UNROLL
for (size_t i = 0; i < width; i++)
{
- state.out = process(bq, state, insertleft(in[i], state.out));
- out[i] = state.out[filters - 1];
+ self.state.out = process(self.bq, self.state, insertleft(in[i], self.state.out));
+ out[i] = self.state.out[filters - 1];
}
- if (index + width == block_end)
- saved_state = state;
+ if (index + width == self.block_end)
+ self.saved_state = self.state;
}
- else if (index >= block_end)
+ else if (index >= self.block_end)
{
CMT_LOOP_UNROLL
for (size_t i = 0; i < width; i++)
{
- state.out = process(bq, state, insertleft(T(0), state.out));
- out[i] = state.out[filters - 1];
+ self.state.out = process(self.bq, self.state, insertleft(T(0), self.state.out));
+ out[i] = self.state.out[filters - 1];
}
}
else
{
size_t i = 0;
- for (; i < std::min(width, block_end - index); i++)
+ for (; i < std::min(width, self.block_end - index); i++)
{
- const vec<T, 1> in = this->argument_first(cinput, index + i, vec_t<T, 1>());
- state.out = process(bq, state, insertleft(in[0], state.out));
- out[i] = state.out[filters - 1];
+ const vec<T, 1> in = self.argument_first(cinput, index + i, vec_shape<T, 1>());
+ self.state.out = process(self.bq, self.state, insertleft(in[0], self.state.out));
+ out[i] = self.state.out[filters - 1];
}
- saved_state = state;
+ self.saved_state = self.state;
for (; i < width; i++)
{
- state.out = process(bq, state, insertleft(T(0), state.out));
- out[i] = state.out[filters - 1];
+ self.state.out = process(self.bq, self.state, insertleft(T(0), self.state.out));
+ out[i] = self.state.out[filters - 1];
}
}
return out;
}
- KFR_SINTRIN vec<T, filters> process(const biquad_block<T, filters>& bq, biquad_state<T, filters>& state,
- vec<T, filters> in)
+ static KFR_MEM_INTRINSIC vec<T, filters> process(const biquad_block<T, filters>& bq,
+ biquad_state<T, filters>& state, vec<T, filters> in)
{
const vec<T, filters> out = bq.b0 * in + state.s1;
state.s1 = state.s2 + bq.b1 * in - bq.a1 * out;
@@ -255,7 +268,7 @@ struct expression_biquads : expression_base<E1>
* @param e1 Input expression
*/
template <typename T, typename E1>
-CMT_INLINE internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& bq, E1&& e1)
+KFR_FUNCTION internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>& bq, E1&& e1)
{
const biquad_params<T> bqs[1] = { bq };
return internal::expression_biquads<1, T, E1>(bqs, std::forward<E1>(e1));
@@ -268,8 +281,8 @@ CMT_INLINE internal::expression_biquads<1, T, E1> biquad(const biquad_params<T>&
* @note This implementation introduces delay of N - 1 samples, where N is the filter count.
*/
template <size_t filters, typename T, typename E1>
-CMT_INLINE internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_params<T> (&bq)[filters],
- E1&& e1)
+KFR_FUNCTION internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_params<T> (&bq)[filters],
+ E1&& e1)
{
return internal::expression_biquads_l<filters, T, E1>(bq, std::forward<E1>(e1));
}
@@ -281,7 +294,8 @@ CMT_INLINE internal::expression_biquads_l<filters, T, E1> biquad_l(const biquad_
* @note This implementation has zero latency
*/
template <size_t filters, typename T, typename E1>
-CMT_INLINE internal::expression_biquads<filters, T, E1> biquad(const biquad_params<T> (&bq)[filters], E1&& e1)
+KFR_FUNCTION internal::expression_biquads<filters, T, E1> biquad(const biquad_params<T> (&bq)[filters],
+ E1&& e1)
{
return internal::expression_biquads<filters, T, E1>(bq, std::forward<E1>(e1));
}
@@ -292,10 +306,11 @@ CMT_INLINE internal::expression_biquads<filters, T, E1> biquad(const biquad_para
* @param e1 Input expression
* @note This implementation has zero latency
*/
-template <typename T, typename E1>
-CMT_INLINE expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count, E1&& e1)
+template <size_t maxfiltercount = 4, typename T, typename E1>
+KFR_FUNCTION expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count, E1&& e1)
{
- return cswitch(csizes_t<1, 2, 4, 8, 16, 32, 64>(), next_poweroftwo(count),
+ constexpr csizes_t<1, 2, 4, 8, 16, 32, 64> sizes;
+ return cswitch(cfilter(sizes, sizes <= csize_t<maxfiltercount>{}), next_poweroftwo(count),
[&](auto x) {
constexpr size_t filters = x;
return to_pointer(internal::expression_biquads<filters, T, E1>(
@@ -304,12 +319,12 @@ CMT_INLINE expression_pointer<T> biquad(const biquad_params<T>* bq, size_t count
[&] { return to_pointer(zeros<T>()); });
}
-template <typename T>
+template <typename T, size_t maxfiltercount = 4>
class biquad_filter : public expression_filter<T>
{
public:
biquad_filter(const biquad_params<T>* bq, size_t count)
- : expression_filter<T>(biquad(bq, count, placeholder<T>()))
+ : expression_filter<T>(biquad<maxfiltercount>(bq, count, placeholder<T>()))
{
}
@@ -318,4 +333,5 @@ public:
{
}
};
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/biquad_design.hpp b/include/kfr/dsp/biquad_design.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup biquad
* @{
*/
/*
@@ -30,6 +30,8 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
/**
* @brief Calculates coefficients for the all-pass biquad filter
@@ -38,7 +40,7 @@ namespace kfr
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q)
+KFR_FUNCTION biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q)
{
const T alpha = std::sin(frequency) / 2.0 * Q;
const T cs = std::cos(frequency);
@@ -59,7 +61,7 @@ biquad_params<T> biquad_allpass(identity<T> frequency, identity<T> Q)
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q)
+KFR_FUNCTION biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q)
{
const T K = std::tan(c_pi<T, 1> * frequency);
const T K2 = K * K;
@@ -79,7 +81,7 @@ biquad_params<T> biquad_lowpass(identity<T> frequency, identity<T> Q)
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q)
+KFR_FUNCTION biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q)
{
const T K = std::tan(c_pi<T, 1> * frequency);
const T K2 = K * K;
@@ -99,7 +101,7 @@ biquad_params<T> biquad_highpass(identity<T> frequency, identity<T> Q)
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q)
+KFR_FUNCTION biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q)
{
const T K = std::tan(c_pi<T, 1> * frequency);
const T K2 = K * K;
@@ -119,7 +121,7 @@ biquad_params<T> biquad_bandpass(identity<T> frequency, identity<T> Q)
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q)
+KFR_FUNCTION biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q)
{
const T K = std::tan(c_pi<T, 1> * frequency);
const T K2 = K * K;
@@ -140,7 +142,7 @@ biquad_params<T> biquad_notch(identity<T> frequency, identity<T> Q)
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain)
+KFR_FUNCTION biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> gain)
{
biquad_params<T> result;
const T K = std::tan(c_pi<T, 1> * frequency);
@@ -177,7 +179,7 @@ biquad_params<T> biquad_peak(identity<T> frequency, identity<T> Q, identity<T> g
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain)
+KFR_FUNCTION biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain)
{
biquad_params<T> result;
const T K = std::tan(c_pi<T, 1> * frequency);
@@ -214,7 +216,7 @@ biquad_params<T> biquad_lowshelf(identity<T> frequency, identity<T> gain)
* @return Biquad filter coefficients
*/
template <typename T = fbase>
-biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain)
+KFR_FUNCTION biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain)
{
biquad_params<T> result;
const T K = std::tan(c_pi<T, 1> * frequency);
@@ -243,4 +245,5 @@ biquad_params<T> biquad_highshelf(identity<T> frequency, identity<T> gain)
}
return result;
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/dcremove.hpp b/include/kfr/dsp/dcremove.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup biquad
* @{
*/
/*
@@ -30,11 +30,14 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
template <typename E1, typename T = flt_type<value_type_of<E1>>>
-CMT_INLINE internal::expression_biquads<1, T, E1> dcremove(E1&& e1, double cutoff = 0.00025)
+KFR_INTRINSIC internal::expression_biquads<1, T, E1> dcremove(E1&& e1, double cutoff = 0.00025)
{
const biquad_params<T> bqs[1] = { biquad_highpass(cutoff, 0.5) };
return internal::expression_biquads<1, T, E1>(bqs, std::forward<E1>(e1));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/delay.hpp b/include/kfr/dsp/delay.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup fir
* @{
*/
/*
@@ -30,43 +30,48 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
namespace internal
{
template <size_t delay, typename E>
-struct expression_delay : expression_base<E>
+struct expression_delay : expression_with_arguments<E>
{
using value_type = value_type_of<E>;
using T = value_type;
- using expression_base<E>::expression_base;
+ using expression_with_arguments<E>::expression_with_arguments;
template <size_t N, KFR_ENABLE_IF(N <= delay)>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
+ vec_shape<T, N>)
{
vec<T, N> out;
- size_t c = cursor;
- data.ringbuf_read(c, out);
- const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>());
- data.ringbuf_write(cursor, in);
+ size_t c = self.cursor;
+ self.data.ringbuf_read(c, out);
+ const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>());
+ self.data.ringbuf_write(self.cursor, in);
return out;
}
- vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const
+ friend vec<T, 1> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
+ vec_shape<T, 1>)
{
T out;
- size_t c = cursor;
- data.ringbuf_read(c, out);
- const T in = this->argument_first(cinput, index, vec_t<T, 1>())[0];
- data.ringbuf_write(cursor, in);
+ size_t c = self.cursor;
+ self.data.ringbuf_read(c, out);
+ const T in = self.argument_first(cinput, index, vec_shape<T, 1>())[0];
+ self.data.ringbuf_write(self.cursor, in);
return out;
}
template <size_t N, KFR_ENABLE_IF(N > delay)>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ friend vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
+ vec_shape<T, N>)
{
vec<T, delay> out;
- size_t c = cursor;
- data.ringbuf_read(c, out);
- const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>());
- data.ringbuf_write(cursor, slice<N - delay, delay>(in));
+ size_t c = self.cursor;
+ self.data.ringbuf_read(c, out);
+ const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>());
+ self.data.ringbuf_write(self.cursor, slice<N - delay, delay>(in));
return concat_and_slice<0, N>(out, in);
}
@@ -75,18 +80,19 @@ struct expression_delay : expression_base<E>
};
template <typename E>
-struct expression_delay<1, E> : expression_base<E>
+struct expression_delay<1, E> : expression_with_arguments<E>
{
using value_type = value_type_of<E>;
using T = value_type;
- using expression_base<E>::expression_base;
+ using expression_with_arguments<E>::expression_with_arguments;
template <size_t N>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
+ vec_shape<T, N>)
{
- const vec<T, N> in = this->argument_first(cinput, index, vec_t<T, N>());
- const vec<T, N> out = insertleft(data, in);
- data = in[N - 1];
+ const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>());
+ const vec<T, N> out = insertleft(self.data, in);
+ self.data = in[N - 1];
return out;
}
mutable value_type data = value_type(0);
@@ -103,9 +109,10 @@ struct expression_delay<1, E> : expression_base<E>
* @endcode
*/
template <size_t samples = 1, typename E1>
-CMT_INLINE internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>())
+KFR_INTRINSIC internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>())
{
static_assert(samples >= 1 && samples < 1024, "");
return internal::expression_delay<samples, E1>(std::forward<E1>(e1));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/ebu.hpp b/include/kfr/dsp/ebu.hpp
@@ -1,3 +1,28 @@
+/** @addtogroup ebu
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
#pragma once
#include <vector>
@@ -16,15 +41,17 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Winaccessible-base")
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
template <typename T>
-KFR_SINTRIN T energy_to_loudness(T energy)
+KFR_INTRINSIC T energy_to_loudness(T energy)
{
return T(10) * log10(energy) - T(0.691);
}
template <typename T>
-KFR_SINTRIN T loudness_to_energy(T loudness)
+KFR_INTRINSIC T loudness_to_energy(T loudness)
{
return exp10((loudness + T(0.691)) * T(0.1));
}
@@ -88,8 +115,8 @@ public:
}
private:
- mutable bool m_integrated_cached;
mutable T m_integrated;
+ mutable bool m_integrated_cached;
};
template <typename T>
@@ -98,10 +125,10 @@ struct lra_vec : public univector<T>
private:
void compute() const
{
- m_range_high = -70.0;
- m_range_low = -70.0;
- static const T PRC_LOW = 0.10;
- static const T PRC_HIGH = 0.95;
+ m_range_high = -70;
+ m_range_low = -70;
+ static const T PRC_LOW = T(0.10);
+ static const T PRC_HIGH = T(0.95);
const T z_total = mean(*this);
const T relative_gate = energy_to_loudness(z_total) - 20;
@@ -151,13 +178,13 @@ public:
}
private:
- mutable bool m_lra_cached;
mutable T m_range_low;
mutable T m_range_high;
+ mutable bool m_lra_cached;
};
template <typename T>
-KFR_SINTRIN expression_pointer<T> make_kfilter(int samplerate)
+KFR_INTRINSIC expression_pointer<T> make_kfilter(int samplerate)
{
const biquad_params<T> bq[] = {
biquad_highshelf(T(1681.81 / samplerate), T(+4.0)),
@@ -199,8 +226,8 @@ public:
void reset()
{
- std::fill(m_short_sum_of_squares.begin(), m_short_sum_of_squares.end(), 0);
- std::fill(m_momentary_sum_of_squares.begin(), m_momentary_sum_of_squares.end(), 0);
+ std::fill(m_short_sum_of_squares.begin(), m_short_sum_of_squares.end(), T(0));
+ std::fill(m_momentary_sum_of_squares.begin(), m_momentary_sum_of_squares.end(), T(0));
}
void process_packet(const T* src)
@@ -214,15 +241,15 @@ public:
Speaker get_speaker() const { return m_speaker; }
private:
+ const int m_sample_rate;
const Speaker m_speaker;
const T m_input_gain;
- const int m_sample_rate;
const size_t m_packet_size;
expression_pointer<T> m_kfilter;
- T m_output_energy_gain;
- univector<T> m_buffer;
univector<T> m_short_sum_of_squares;
univector<T> m_momentary_sum_of_squares;
+ T m_output_energy_gain;
+ univector<T> m_buffer;
size_t m_buffer_cursor;
size_t m_short_sum_of_squares_cursor;
size_t m_momentary_sum_of_squares_cursor;
@@ -239,7 +266,7 @@ public:
{
for (Speaker sp : channels)
{
- m_channels.emplace_back(sample_rate, sp, packet_size_factor, 1);
+ m_channels.emplace_back(sample_rate, sp, packet_size_factor, T(1));
}
}
@@ -327,6 +354,7 @@ private:
lra_vec<T> m_lra_buffer;
};
+} // namespace CMT_ARCH_NAME
} // namespace kfr
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup fir
* @{
*/
/*
@@ -30,10 +30,12 @@
#include "../base/memory.hpp"
#include "../base/reduce.hpp"
#include "../base/univector.hpp"
-#include "../base/vec.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
template <typename T, size_t Size>
using fir_taps = univector<T, Size>;
@@ -77,7 +79,7 @@ struct state_holder
state_holder() = delete;
state_holder(const state_holder&) = default;
state_holder(state_holder&&) = default;
- constexpr state_holder(const T& state) noexcept : s(state) {}
+ constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {}
T s;
};
@@ -87,30 +89,32 @@ struct state_holder<T, true>
state_holder() = delete;
state_holder(const state_holder&) = default;
state_holder(state_holder&&) = default;
- constexpr state_holder(const T& state) noexcept : s(state) {}
+ constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {}
const T& s;
};
template <size_t tapcount, typename T, typename U, typename E1, bool stateless = false, KFR_ARCH_DEP>
-struct expression_short_fir : expression_base<E1>
+struct expression_short_fir : expression_with_arguments<E1>
{
using value_type = U;
expression_short_fir(E1&& e1, const short_fir_state<tapcount, T, U>& state)
- : expression_base<E1>(std::forward<E1>(e1)), state(state)
+ : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state)
{
}
template <size_t N>
- CMT_INLINE vec<U, N> operator()(cinput_t cinput, size_t index, vec_t<U, N> x) const
+ KFR_INTRINSIC friend vec<U, N> get_elements(const expression_short_fir& self, cinput_t cinput,
+ size_t index, vec_shape<U, N> x)
{
- vec<U, N> in = this->argument_first(cinput, index, x);
+ vec<U, N> in = self.argument_first(cinput, index, x);
- vec<U, N> out = in * state.s.taps[0];
- cforeach(csizeseq_t<tapcount - 1, 1>(), [&](auto I) {
- out = out + concat_and_slice<tapcount - 1 - I, N>(state.s.delayline, in) * state.s.taps[I];
+ vec<U, N> out = in * self.state.s.taps.front();
+ cforeach(csizeseq<tapcount - 1, 1>, [&](auto I) {
+ out = out +
+ concat_and_slice<tapcount - 1 - I, N>(self.state.s.delayline, in) * self.state.s.taps[I];
});
- state.s.delayline = concat_and_slice<N, tapcount - 1>(state.s.delayline, in);
+ self.state.s.delayline = concat_and_slice<N, tapcount - 1>(self.state.s.delayline, in);
return out;
}
@@ -118,31 +122,33 @@ struct expression_short_fir : expression_base<E1>
};
template <typename T, typename U, typename E1, bool stateless = false, KFR_ARCH_DEP>
-struct expression_fir : expression_base<E1>
+struct expression_fir : expression_with_arguments<E1>
{
using value_type = U;
expression_fir(E1&& e1, const fir_state<T, U>& state)
- : expression_base<E1>(std::forward<E1>(e1)), state(state)
+ : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state)
{
}
template <size_t N>
- CMT_INLINE vec<U, N> operator()(cinput_t cinput, size_t index, vec_t<U, N> x) const
+ KFR_INTRINSIC friend vec<U, N> get_elements(const expression_fir& self, cinput_t cinput, size_t index,
+ vec_shape<U, N> x)
{
- const size_t tapcount = state.s.taps.size();
- const vec<U, N> input = this->argument_first(cinput, index, x);
+ const size_t tapcount = self.state.s.taps.size();
+ const vec<U, N> input = self.argument_first(cinput, index, x);
vec<U, N> output;
- size_t cursor = state.s.delayline_cursor;
+ size_t cursor = self.state.s.delayline_cursor;
CMT_LOOP_NOUNROLL
for (size_t i = 0; i < N; i++)
{
- state.s.delayline.ringbuf_write(cursor, input[i]);
- output[i] = dotproduct(state.s.taps, state.s.delayline.slice(cursor) /*, tapcount - cursor*/) +
- dotproduct(state.s.taps.slice(tapcount - cursor), state.s.delayline /*, cursor*/);
+ self.state.s.delayline.ringbuf_write(cursor, input[i]);
+ output[i] =
+ dotproduct(self.state.s.taps, self.state.s.delayline.slice(cursor) /*, tapcount - cursor*/) +
+ dotproduct(self.state.s.taps.slice(tapcount - cursor), self.state.s.delayline /*, cursor*/);
}
- state.s.delayline_cursor = cursor;
+ self.state.s.delayline_cursor = cursor;
return output;
}
state_holder<fir_state<T, U>, stateless> state;
@@ -155,7 +161,7 @@ struct expression_fir : expression_base<E1>
* @param taps coefficients for the FIR filter
*/
template <typename T, typename E1, univector_tag Tag>
-CMT_INLINE internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const univector<T, Tag>& taps)
+KFR_INTRINSIC internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const univector<T, Tag>& taps)
{
return internal::expression_fir<T, value_type_of<E1>, E1>(std::forward<E1>(e1), taps.ref());
}
@@ -166,7 +172,7 @@ CMT_INLINE internal::expression_fir<T, value_type_of<E1>, E1> fir(E1&& e1, const
* @param e1 an input expression
*/
template <typename T, typename U, typename E1>
-CMT_INLINE internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, E1&& e1)
+KFR_INTRINSIC internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state, E1&& e1)
{
return internal::expression_fir<T, U, E1, true>(std::forward<E1>(e1), state);
}
@@ -178,7 +184,7 @@ CMT_INLINE internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& state,
* @param taps coefficients for the FIR filter
*/
template <typename T, size_t TapCount, typename E1>
-CMT_INLINE internal::expression_short_fir<next_poweroftwo(TapCount), T, value_type_of<E1>, E1> short_fir(
+KFR_INTRINSIC internal::expression_short_fir<next_poweroftwo(TapCount), T, value_type_of<E1>, E1> short_fir(
E1&& e1, const univector<T, TapCount>& taps)
{
static_assert(TapCount >= 2 && TapCount <= 32, "Use short_fir only for small FIR filters");
@@ -214,4 +220,5 @@ protected:
private:
fir_state<T, U> state;
};
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/fir_design.hpp b/include/kfr/dsp/fir_design.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup fir
* @{
*/
/*
@@ -25,13 +25,15 @@
*/
#pragma once
-#include "../base/sin_cos.hpp"
+#include "../math/sin_cos.hpp"
#include "fir.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
-namespace intrinsics
+namespace internal
{
template <typename T>
void fir_lowpass(univector_ref<T> taps, T cutoff, const expression_pointer<T>& window, bool normalize = true)
@@ -115,11 +117,11 @@ void fir_bandstop(univector_ref<T> taps, T frequency1, T frequency2, const expre
taps = taps * invsum;
}
}
-} // namespace intrinsics
-KFR_I_FN(fir_lowpass)
-KFR_I_FN(fir_highpass)
-KFR_I_FN(fir_bandpass)
-KFR_I_FN(fir_bandstop)
+} // namespace internal
+KFR_I_FN_FULL(fir_lowpass, internal::fir_lowpass)
+KFR_I_FN_FULL(fir_highpass, internal::fir_highpass)
+KFR_I_FN_FULL(fir_bandpass, internal::fir_bandpass)
+KFR_I_FN_FULL(fir_bandstop, internal::fir_bandstop)
/**
* @brief Calculates coefficients for the low-pass FIR filter
@@ -129,10 +131,10 @@ KFR_I_FN(fir_bandstop)
* @param normalize true for normalized coefficients
*/
template <typename T, univector_tag Tag>
-CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window,
- bool normalize = true)
+KFR_INTRINSIC void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_lowpass(taps.slice(), cutoff, window, normalize);
+ return internal::fir_lowpass(taps.slice(), cutoff, window, normalize);
}
/**
@@ -143,10 +145,10 @@ CMT_INLINE void fir_lowpass(univector<T, Tag>& taps, identity<T> cutoff, const e
* @param normalize true for normalized coefficients
*/
template <typename T, univector_tag Tag>
-CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const expression_pointer<T>& window,
- bool normalize = true)
+KFR_INTRINSIC void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_highpass(taps.slice(), cutoff, window, normalize);
+ return internal::fir_highpass(taps.slice(), cutoff, window, normalize);
}
/**
@@ -158,10 +160,10 @@ CMT_INLINE void fir_highpass(univector<T, Tag>& taps, identity<T> cutoff, const
* @param normalize true for normalized coefficients
*/
template <typename T, univector_tag Tag>
-CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2,
- const expression_pointer<T>& window, bool normalize = true)
+KFR_INTRINSIC void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize);
+ return internal::fir_bandpass(taps.slice(), frequency1, frequency2, window, normalize);
}
/**
@@ -173,49 +175,50 @@ CMT_INLINE void fir_bandpass(univector<T, Tag>& taps, identity<T> frequency1, id
* @param normalize true for normalized coefficients
*/
template <typename T, univector_tag Tag>
-CMT_INLINE void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2,
- const expression_pointer<T>& window, bool normalize = true)
+KFR_INTRINSIC void fir_bandstop(univector<T, Tag>& taps, identity<T> frequency1, identity<T> frequency2,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize);
+ return internal::fir_bandstop(taps.slice(), frequency1, frequency2, window, normalize);
}
/**
* @copydoc kfr::fir_lowpass
*/
template <typename T>
-CMT_INLINE void fir_lowpass(const univector_ref<T>& taps, identity<T> cutoff,
- const expression_pointer<T>& window, bool normalize = true)
+KFR_INTRINSIC void fir_lowpass(const univector_ref<T>& taps, identity<T> cutoff,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_lowpass(taps, cutoff, window, normalize);
+ return internal::fir_lowpass(taps, cutoff, window, normalize);
}
/**
* @copydoc kfr::fir_highpass
*/
template <typename T>
-CMT_INLINE void fir_highpass(const univector_ref<T>& taps, identity<T> cutoff,
- const expression_pointer<T>& window, bool normalize = true)
+KFR_INTRINSIC void fir_highpass(const univector_ref<T>& taps, identity<T> cutoff,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_highpass(taps, cutoff, window, normalize);
+ return internal::fir_highpass(taps, cutoff, window, normalize);
}
/**
* @copydoc kfr::fir_bandpass
*/
template <typename T>
-CMT_INLINE void fir_bandpass(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2,
- const expression_pointer<T>& window, bool normalize = true)
+KFR_INTRINSIC void fir_bandpass(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_bandpass(taps, frequency1, frequency2, window, normalize);
+ return internal::fir_bandpass(taps, frequency1, frequency2, window, normalize);
}
/**
* @copydoc kfr::fir_bandstop
*/
template <typename T>
-CMT_INLINE void fir_bandstop(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2,
- const expression_pointer<T>& window, bool normalize = true)
+KFR_INTRINSIC void fir_bandstop(const univector_ref<T>& taps, identity<T> frequency1, identity<T> frequency2,
+ const expression_pointer<T>& window, bool normalize = true)
{
- return intrinsics::fir_bandstop(taps, frequency1, frequency2, window, normalize);
+ return internal::fir_bandstop(taps, frequency1, frequency2, window, normalize);
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/fracdelay.hpp b/include/kfr/dsp/fracdelay.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup fir
* @{
*/
/*
@@ -30,12 +30,16 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
template <typename T, typename E1>
-CMT_INLINE internal::expression_short_fir<2, T, value_type_of<E1>, E1> fracdelay(E1&& e1, T delay)
+KFR_INTRINSIC internal::expression_short_fir<2, T, value_type_of<E1>, E1> fracdelay(E1&& e1, T delay)
{
if (delay < 0)
delay = 0;
univector<T, 2> taps({ 1 - delay, delay });
return internal::expression_short_fir<2, T, value_type_of<E1>, E1>(std::forward<E1>(e1), taps);
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -26,12 +26,15 @@
#pragma once
#include "../base/basic_expressions.hpp"
-#include "../base/complex.hpp"
-#include "../base/sin_cos.hpp"
-#include "../base/vec.hpp"
+#include "../math/sin_cos.hpp"
+#include "../simd/complex.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
namespace internal
{
@@ -48,7 +51,7 @@ struct expression_goertzel : output_expression
result.imag(q2 * sin(omega));
}
template <typename U, size_t N>
- CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
+ KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<U, N>& x)
{
vec<T, N> in = x;
CMT_LOOP_UNROLL
@@ -85,7 +88,7 @@ struct expression_parallel_goertzel : output_expression
}
}
template <typename U, size_t N>
- CMT_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
+ KFR_MEM_INTRINSIC void operator()(coutput_t, size_t, const vec<U, N>& x)
{
const vec<T, N> in = x;
CMT_LOOP_UNROLL
@@ -103,18 +106,19 @@ struct expression_parallel_goertzel : output_expression
vec<T, width> q1;
vec<T, width> q2;
};
-}; // namespace internal
+} // namespace internal
template <typename T>
-KFR_SINTRIN internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega)
+KFR_INTRINSIC internal::expression_goertzel<T> goertzel(complex<T>& result, identity<T> omega)
{
return internal::expression_goertzel<T>(result, omega);
}
template <typename T, size_t width>
-KFR_SINTRIN internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width],
- const T (&omega)[width])
+KFR_INTRINSIC internal::expression_parallel_goertzel<T, width> goertzel(complex<T> (&result)[width],
+ const T (&omega)[width])
{
return internal::expression_parallel_goertzel<T, width>(result, read<width>(omega));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/interpolation.hpp b/include/kfr/dsp/interpolation.hpp
@@ -1,72 +0,0 @@
-/** @addtogroup dsp
- * @{
- */
-/*
- Copyright (C) 2016 D Levin (https://www.kfrlib.com)
- This file is part of KFR
-
- KFR is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- KFR is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with KFR.
-
- If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
- Buying a commercial license is mandatory as soon as you develop commercial activities without
- disclosing the source code of your own applications.
- See https://www.kfrlib.com for details.
- */
-#pragma once
-
-#include "../base/select.hpp"
-#include "../base/sin_cos.hpp"
-#include "../base/vec.hpp"
-
-namespace kfr
-{
-
-template <typename T, typename M>
-KFR_FUNC T nearest(M mu, T x1, T x2)
-{
- return select(mu < M(0.5), x1, x2);
-}
-
-template <typename T, typename M>
-KFR_FUNC T linear(M mu, T x1, T x2)
-{
- return mix(mu, x1, x2);
-}
-
-template <typename T, typename M>
-KFR_FUNC T cosine(M mu, T x1, T x2)
-{
- return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2);
-}
-
-template <typename T, typename M>
-KFR_FUNC T cubic(M mu, T x0, T x1, T x2, T x3)
-{
- const T a0 = x3 - x2 - x0 + x1;
- const T a1 = x0 - x1 - a0;
- const T a2 = x2 - x0;
- const T a3 = x1;
- return horner(mu, a0, a1, a2, a3);
-}
-
-template <typename T, typename M>
-KFR_FUNC T catmullrom(M mu, T x0, T x1, T x2, T x3)
-{
- const T a0 = T(0.5) * (x3 - x0) - T(1.5) * (x2 - x1);
- const T a1 = x0 - T(2.5) * x1 + T(2) * x2 - T(0.5) * x3;
- const T a2 = T(0.5) * (x2 - x0);
- const T a3 = x1;
- return horner(mu, a0, a1, a2, a3);
-}
-} // namespace kfr
diff --git a/include/kfr/dsp/mixdown.hpp b/include/kfr/dsp/mixdown.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -29,6 +29,9 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
/**
* @brief Returns template expression that returns the sum of all the inputs
*/
@@ -43,12 +46,12 @@ namespace internal
struct stereo_matrix
{
template <typename T, size_t N>
- CMT_INLINE vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const
+ KFR_MEM_INTRINSIC vec<vec<T, 2>, N> operator()(const vec<vec<T, 2>, N>& x) const
{
- return process(x, csizeseq_t<N>());
+ return process(x, csizeseq<N>);
}
template <typename T, size_t N, size_t... indices>
- CMT_INLINE vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const
+ KFR_MEM_INTRINSIC vec<vec<T, 2>, N> process(const vec<vec<T, 2>, N>& x, csizes_t<indices...>) const
{
return vec<vec<T, 2>, N>(hadd(transpose(x[indices] * matrix))...);
}
@@ -79,4 +82,5 @@ Result mixdown_stereo(Left&& left, Right&& right, const f64x2x2& matrix)
return Result(internal::stereo_matrix{ matrix },
pack(std::forward<Left>(left), std::forward<Right>(right)));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/oscillators.hpp b/include/kfr/dsp/oscillators.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup oscillators
* @{
*/
/*
@@ -26,19 +26,21 @@
#pragma once
#include "../base/basic_expressions.hpp"
-#include "../base/sin_cos.hpp"
+#include "../math/sin_cos.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
template <typename T = fbase>
-KFR_FUNC static auto phasor(identity<T> frequency, identity<T> sample_rate, identity<T> phase = 0)
+KFR_FUNCTION static auto phasor(identity<T> frequency, identity<T> sample_rate, identity<T> phase = 0)
{
return fract(counter(phase, frequency / sample_rate));
}
template <typename T = fbase>
-KFR_FUNC static auto phasor(identity<T> frequency)
+KFR_FUNCTION static auto phasor(identity<T> frequency)
{
return phasor(frequency, 1, 0);
}
@@ -46,76 +48,76 @@ KFR_FUNC static auto phasor(identity<T> frequency)
namespace intrinsics
{
template <typename T>
-KFR_FUNC T rawsine(const T& x)
+KFR_INTRINSIC T rawsine(const T& x)
{
return intrinsics::fastsin(x * constants<T>::pi_s(2));
}
template <typename T>
-KFR_FUNC T sinenorm(const T& x)
+KFR_INTRINSIC T sinenorm(const T& x)
{
return intrinsics::rawsine(fract(x));
}
template <typename T>
-KFR_FUNC T sine(const T& x)
+KFR_INTRINSIC T sine(const T& x)
{
return intrinsics::sinenorm(constants<T>::recip_pi_s(1, 2) * x);
}
template <typename T>
-KFR_FUNC T rawsquare(const T& x)
+KFR_INTRINSIC T rawsquare(const T& x)
{
return select(x < T(0.5), T(1), -T(1));
}
template <typename T>
-KFR_FUNC T squarenorm(const T& x)
+KFR_INTRINSIC T squarenorm(const T& x)
{
return intrinsics::rawsquare(fract(x));
}
template <typename T>
-KFR_FUNC T square(const T& x)
+KFR_INTRINSIC T square(const T& x)
{
return intrinsics::squarenorm(constants<T>::recip_pi_s(1, 2) * x);
}
template <typename T>
-KFR_FUNC T rawsawtooth(const T& x)
+KFR_INTRINSIC T rawsawtooth(const T& x)
{
return T(1) - 2 * x;
}
template <typename T>
-KFR_FUNC T sawtoothnorm(const T& x)
+KFR_INTRINSIC T sawtoothnorm(const T& x)
{
return intrinsics::rawsawtooth(fract(x));
}
template <typename T>
-KFR_FUNC T sawtooth(const T& x)
+KFR_INTRINSIC T sawtooth(const T& x)
{
return intrinsics::sawtoothnorm(constants<T>::recip_pi_s(1, 2) * x);
}
template <typename T>
-KFR_FUNC T isawtoothnorm(const T& x)
+KFR_INTRINSIC T isawtoothnorm(const T& x)
{
return T(-1) + 2 * fract(x + 0.5);
}
template <typename T>
-KFR_FUNC T isawtooth(const T& x)
+KFR_INTRINSIC T isawtooth(const T& x)
{
return intrinsics::isawtoothnorm(constants<T>::recip_pi_s(1, 2) * x);
}
template <typename T>
-KFR_FUNC T rawtriangle(const T& x)
+KFR_INTRINSIC T rawtriangle(const T& x)
{
return 1 - abs(4 * x - 2);
}
template <typename T>
-KFR_FUNC T trianglenorm(const T& x)
+KFR_INTRINSIC T trianglenorm(const T& x)
{
return intrinsics::rawtriangle(fract(x + 0.25));
}
template <typename T>
-KFR_FUNC T triangle(const T& x)
+KFR_INTRINSIC T triangle(const T& x)
{
return intrinsics::trianglenorm(constants<T>::recip_pi_s(1, 2) * x);
}
@@ -136,143 +138,145 @@ KFR_I_FN(isawtooth)
KFR_I_FN(isawtoothnorm)
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 rawsine(const T1& x)
+KFR_FUNCTION T1 rawsine(const T1& x)
{
return intrinsics::rawsine(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::rawsine, E1> rawsine(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::rawsine, E1> rawsine(E1&& x)
{
return { fn::rawsine(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 sine(const T1& x)
+KFR_FUNCTION T1 sine(const T1& x)
{
return intrinsics::sine(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sine, E1> sine(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::sine, E1> sine(E1&& x)
{
return { fn::sine(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 sinenorm(const T1& x)
+KFR_FUNCTION T1 sinenorm(const T1& x)
{
return intrinsics::sinenorm(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sinenorm, E1> sinenorm(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::sinenorm, E1> sinenorm(E1&& x)
{
return { fn::sinenorm(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 rawsquare(const T1& x)
+KFR_FUNCTION T1 rawsquare(const T1& x)
{
return intrinsics::rawsquare(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::rawsquare, E1> rawsquare(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::rawsquare, E1> rawsquare(E1&& x)
{
return { fn::rawsquare(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 square(const T1& x)
+KFR_FUNCTION T1 square(const T1& x)
{
return intrinsics::square(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::square, E1> square(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::square, E1> square(E1&& x)
{
return { fn::square(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 squarenorm(const T1& x)
+KFR_FUNCTION T1 squarenorm(const T1& x)
{
return intrinsics::squarenorm(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::squarenorm, E1> squarenorm(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::squarenorm, E1> squarenorm(E1&& x)
{
return { fn::squarenorm(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 rawtriangle(const T1& x)
+KFR_FUNCTION T1 rawtriangle(const T1& x)
{
return intrinsics::rawtriangle(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::rawtriangle, E1> rawtriangle(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::rawtriangle, E1> rawtriangle(E1&& x)
{
return { fn::rawtriangle(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 triangle(const T1& x)
+KFR_FUNCTION T1 triangle(const T1& x)
{
return intrinsics::triangle(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::triangle, E1> triangle(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::triangle, E1> triangle(E1&& x)
{
return { fn::triangle(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 trianglenorm(const T1& x)
+KFR_FUNCTION T1 trianglenorm(const T1& x)
{
return intrinsics::trianglenorm(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::trianglenorm, E1> trianglenorm(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::trianglenorm, E1> trianglenorm(E1&& x)
{
return { fn::trianglenorm(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 rawsawtooth(const T1& x)
+KFR_FUNCTION T1 rawsawtooth(const T1& x)
{
return intrinsics::rawsawtooth(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::rawsawtooth, E1> rawsawtooth(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::rawsawtooth, E1> rawsawtooth(E1&& x)
{
return { fn::rawsawtooth(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 sawtooth(const T1& x)
+KFR_FUNCTION T1 sawtooth(const T1& x)
{
return intrinsics::sawtooth(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sawtooth, E1> sawtooth(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::sawtooth, E1> sawtooth(E1&& x)
{
return { fn::sawtooth(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 sawtoothnorm(const T1& x)
+KFR_FUNCTION T1 sawtoothnorm(const T1& x)
{
return intrinsics::sawtoothnorm(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::sawtoothnorm, E1> sawtoothnorm(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::sawtoothnorm, E1> sawtoothnorm(E1&& x)
{
return { fn::sawtoothnorm(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 isawtooth(const T1& x)
+KFR_FUNCTION T1 isawtooth(const T1& x)
{
return intrinsics::isawtooth(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::isawtooth, E1> isawtooth(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::isawtooth, E1> isawtooth(E1&& x)
{
return { fn::isawtooth(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_FUNC T1 isawtoothnorm(const T1& x)
+KFR_FUNCTION T1 isawtoothnorm(const T1& x)
{
return intrinsics::isawtoothnorm(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_FUNC internal::expression_function<fn::isawtoothnorm, E1> isawtoothnorm(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::isawtoothnorm, E1> isawtoothnorm(E1&& x)
{
return { fn::isawtoothnorm(), std::forward<E1>(x) };
}
+} // namespace CMT_ARCH_NAME
+
} // namespace kfr
diff --git a/include/kfr/dsp/sample_rate_conversion.hpp b/include/kfr/dsp/sample_rate_conversion.hpp
@@ -25,14 +25,17 @@
*/
#pragma once
-#include "../base/function.hpp"
#include "../base/memory.hpp"
#include "../base/reduce.hpp"
-#include "../base/vec.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/vec.hpp"
#include "window.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
enum class sample_rate_conversion_quality : int
{
draft = 4,
@@ -52,32 +55,32 @@ struct samplerate_converter
using ftype = subtype<T>;
private:
- KFR_INTRIN ftype window(ftype n) const
+ KFR_MEM_INTRINSIC ftype window(ftype n) const
{
return modzerobessel(kaiser_beta * sqrt(1 - sqr(2 * n - 1))) * reciprocal(modzerobessel(kaiser_beta));
}
- KFR_INTRIN ftype sidelobe_att() const { return kaiser_beta / 0.1102 + 8.7; }
- KFR_INTRIN ftype transition_width() const { return (sidelobe_att() - 8) / (depth - 1) / 2.285; }
+ KFR_MEM_INTRINSIC ftype sidelobe_att() const { return kaiser_beta / 0.1102 + 8.7; }
+ KFR_MEM_INTRINSIC ftype transition_width() const { return (sidelobe_att() - 8) / (depth - 1) / 2.285; }
public:
- static KFR_INTRIN size_t filter_order(sample_rate_conversion_quality quality)
+ static KFR_MEM_INTRINSIC size_t filter_order(sample_rate_conversion_quality quality)
{
- return 1 << (static_cast<int>(quality) + 1);
+ return size_t(1) << (static_cast<int>(quality) + 1);
}
/// @brief Returns sidelobe attenuation for the given quality (in dB)
- static KFR_INTRIN ftype sidelobe_attenuation(sample_rate_conversion_quality quality)
+ static KFR_MEM_INTRINSIC ftype sidelobe_attenuation(sample_rate_conversion_quality quality)
{
return (static_cast<int>(quality) - 3) * ftype(20);
}
/// @brief Returns transition width for the given quality (in rad)
- static KFR_INTRIN ftype transition_width(sample_rate_conversion_quality quality)
+ static KFR_MEM_INTRINSIC ftype transition_width(sample_rate_conversion_quality quality)
{
return (sidelobe_attenuation(quality) - 8) / (filter_order(quality) - 1) / ftype(2.285);
}
- static KFR_INTRIN ftype window_param(sample_rate_conversion_quality quality)
+ static KFR_MEM_INTRINSIC ftype window_param(sample_rate_conversion_quality quality)
{
const ftype att = sidelobe_attenuation(quality);
if (att > 50)
@@ -112,7 +115,8 @@ public:
for (itype j = 0, jj = 0; j < taps; j++)
{
- filter[size_t(j)] = sinc((jj - halftaps) * cutoff * c_pi<ftype, 2>) * window(ftype(jj) / ftype(taps - 1));
+ filter[size_t(j)] =
+ sinc((jj - halftaps) * cutoff * c_pi<ftype, 2>) * window(ftype(jj) / ftype(taps - 1));
jj += size_t(interpolation_factor);
if (jj >= taps)
jj = jj - taps + 1;
@@ -122,25 +126,31 @@ public:
filter = filter * s;
}
- itype input_position_to_intermediate(itype in_pos) const { return in_pos * interpolation_factor; }
- itype output_position_to_intermediate(itype out_pos) const { return out_pos * decimation_factor; }
+ KFR_MEM_INTRINSIC itype input_position_to_intermediate(itype in_pos) const
+ {
+ return in_pos * interpolation_factor;
+ }
+ KFR_MEM_INTRINSIC itype output_position_to_intermediate(itype out_pos) const
+ {
+ return out_pos * decimation_factor;
+ }
- itype input_position_to_output(itype in_pos) const
+ KFR_MEM_INTRINSIC itype input_position_to_output(itype in_pos) const
{
return floor_div(input_position_to_intermediate(in_pos), decimation_factor).quot;
}
- itype output_position_to_input(itype out_pos) const
+ KFR_MEM_INTRINSIC itype output_position_to_input(itype out_pos) const
{
return floor_div(output_position_to_intermediate(out_pos), interpolation_factor).quot;
}
- itype output_size_for_input(itype input_size) const
+ KFR_MEM_INTRINSIC itype output_size_for_input(itype input_size) const
{
return input_position_to_output(input_position + input_size - 1) -
input_position_to_output(input_position - 1);
}
- itype input_size_for_output(itype output_size) const
+ KFR_MEM_INTRINSIC itype input_size_for_output(itype output_size) const
{
return output_position_to_input(output_position + output_size - 1) -
output_position_to_input(output_position - 1);
@@ -183,7 +193,6 @@ public:
const std::lldiv_t input_pos =
floor_div(intermediate_start + interpolation_factor - 1, interpolation_factor);
const itype input_start = input_pos.quot; // first input sample
- const itype input_end = input_start + depth;
const itype tap_start = interpolation_factor - 1 - input_pos.rem;
const univector_ref<T> tap_ptr = filter.slice(static_cast<size_t>(tap_start * depth));
@@ -219,8 +228,8 @@ public:
return required_input_size;
}
- double get_fractional_delay() const { return (taps - 1) * 0.5 / decimation_factor; }
- size_t get_delay() const { return static_cast<size_t>(get_fractional_delay()); }
+ KFR_MEM_INTRINSIC double get_fractional_delay() const { return (taps - 1) * 0.5 / decimation_factor; }
+ KFR_MEM_INTRINSIC size_t get_delay() const { return static_cast<size_t>(get_fractional_delay()); }
ftype kaiser_beta;
itype depth;
@@ -244,130 +253,140 @@ template <size_t factor, size_t offset, typename E>
struct expression_downsample;
template <typename E>
-struct expression_upsample<2, E> : expression_base<E>
+struct expression_upsample<2, E> : expression_with_arguments<E>
{
- using expression_base<E>::expression_base;
+ using expression_with_arguments<E>::expression_with_arguments;
using value_type = value_type_of<E>;
using T = value_type;
- size_t size() const noexcept { return expression_base<E>::size() * 2; }
+ KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() * 2; }
template <size_t N>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_upsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, N>)
{
- const vec<T, N / 2> x = this->argument_first(cinput, index / 2, vec_t<T, N / 2>());
+ const vec<T, N / 2> x = self.argument_first(cinput, index / 2, vec_shape<T, N / 2>());
return interleave(x, zerovector(x));
}
- vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const
+ KFR_INTRINSIC friend vec<T, 1> get_elements(const expression_upsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, 1>)
{
if (index & 1)
return 0;
else
- return this->argument_first(cinput, index / 2, vec_t<T, 1>());
+ return self.argument_first(cinput, index / 2, vec_shape<T, 1>());
}
};
template <typename E>
-struct expression_upsample<4, E> : expression_base<E>
+struct expression_upsample<4, E> : expression_with_arguments<E>
{
- using expression_base<E>::expression_base;
+ using expression_with_arguments<E>::expression_with_arguments;
using value_type = value_type_of<E>;
using T = value_type;
- size_t size() const noexcept { return expression_base<E>::size() * 4; }
+ KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() * 4; }
template <size_t N>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_upsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, N>) CMT_NOEXCEPT
{
- const vec<T, N / 4> x = this->argument_first(cinput, index / 4, vec_t<T, N / 4>());
+ const vec<T, N / 4> x = self.argument_first(cinput, index / 4, vec_shape<T, N / 4>());
const vec<T, N / 2> xx = interleave(x, zerovector(x));
return interleave(xx, zerovector(xx));
}
- vec<T, 2> operator()(cinput_t cinput, size_t index, vec_t<T, 2>) const
+ KFR_INTRINSIC friend vec<T, 2> get_elements(const expression_upsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, 2>) CMT_NOEXCEPT
{
switch (index & 3)
{
case 0:
- return interleave(this->argument_first(cinput, index / 4, vec_t<T, 1>()), zerovector<T, 1>());
+ return interleave(self.argument_first(cinput, index / 4, vec_shape<T, 1>()), zerovector<T, 1>());
case 3:
- return interleave(zerovector<T, 1>(), this->argument_first(cinput, index / 4, vec_t<T, 1>()));
+ return interleave(zerovector<T, 1>(), self.argument_first(cinput, index / 4, vec_shape<T, 1>()));
default:
return 0;
}
}
- vec<T, 1> operator()(cinput_t cinput, size_t index, vec_t<T, 1>) const
+ KFR_INTRINSIC friend vec<T, 1> get_elements(const expression_upsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, 1>) CMT_NOEXCEPT
{
if (index & 3)
return 0;
else
- return this->argument_first(cinput, index / 4, vec_t<T, 1>());
+ return self.argument_first(cinput, index / 4, vec_shape<T, 1>());
}
};
template <typename E, size_t offset>
-struct expression_downsample<2, offset, E> : expression_base<E>
+struct expression_downsample<2, offset, E> : expression_with_arguments<E>
{
- using expression_base<E>::expression_base;
+ using expression_with_arguments<E>::expression_with_arguments;
using value_type = value_type_of<E>;
using T = value_type;
- size_t size() const noexcept { return expression_base<E>::size() / 2; }
+ KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() / 2; }
template <size_t N>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_downsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, N>) CMT_NOEXCEPT
{
- const vec<T, N* 2> x = this->argument_first(cinput, index * 2, vec_t<T, N * 2>());
- return x.shuffle(csizeseq_t<N, offset, 2>());
+ const vec<T, N* 2> x = self.argument_first(cinput, index * 2, vec_shape<T, N * 2>());
+ return x.shuffle(csizeseq<N, offset, 2>);
}
};
template <typename E, size_t offset>
-struct expression_downsample<4, offset, E> : expression_base<E>
+struct expression_downsample<4, offset, E> : expression_with_arguments<E>
{
- using expression_base<E>::expression_base;
+ using expression_with_arguments<E>::expression_with_arguments;
using value_type = value_type_of<E>;
using T = value_type;
- size_t size() const noexcept { return expression_base<E>::size() / 4; }
+ KFR_MEM_INTRINSIC size_t size() const CMT_NOEXCEPT { return expression_with_arguments<E>::size() / 4; }
template <size_t N>
- vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_downsample& self, cinput_t cinput,
+ size_t index, vec_shape<T, N>) CMT_NOEXCEPT
{
- const vec<T, N* 4> x = this->argument_first(cinput, index * 4, vec_t<T, N * 4>());
- return x.shuffle(csizeseq_t<N, offset, 4>());
+ const vec<T, N* 4> x = self.argument_first(cinput, index * 4, vec_shape<T, N * 4>());
+ return x.shuffle(csizeseq<N, offset, 4>);
}
};
} // namespace internal
template <typename E1, size_t offset = 0>
-CMT_INLINE internal::expression_downsample<2, offset, E1> downsample2(E1&& e1, csize_t<offset> = csize_t<0>())
+KFR_FUNCTION internal::expression_downsample<2, offset, E1> downsample2(E1&& e1,
+ csize_t<offset> = csize_t<0>())
{
return internal::expression_downsample<2, offset, E1>(std::forward<E1>(e1));
}
template <typename E1, size_t offset = 0>
-CMT_INLINE internal::expression_downsample<4, offset, E1> downsample4(E1&& e1, csize_t<offset> = csize_t<0>())
+KFR_FUNCTION internal::expression_downsample<4, offset, E1> downsample4(E1&& e1,
+ csize_t<offset> = csize_t<0>())
{
return internal::expression_downsample<4, offset, E1>(std::forward<E1>(e1));
}
template <typename E1>
-CMT_INLINE internal::expression_upsample<2, E1> upsample2(E1&& e1)
+KFR_FUNCTION internal::expression_upsample<2, E1> upsample2(E1&& e1)
{
return internal::expression_upsample<2, E1>(std::forward<E1>(e1));
}
template <typename E1>
-CMT_INLINE internal::expression_upsample<4, E1> upsample4(E1&& e1)
+KFR_FUNCTION internal::expression_upsample<4, E1> upsample4(E1&& e1)
{
return internal::expression_upsample<4, E1>(std::forward<E1>(e1));
}
template <typename T = fbase>
-inline samplerate_converter<T> sample_rate_converter(sample_rate_conversion_quality quality,
- size_t interpolation_factor, size_t decimation_factor,
- subtype<T> scale = subtype<T>(1),
- subtype<T> cutoff = 0.5f)
+KFR_FUNCTION samplerate_converter<T> sample_rate_converter(sample_rate_conversion_quality quality,
+ size_t interpolation_factor,
+ size_t decimation_factor,
+ subtype<T> scale = subtype<T>(1),
+ subtype<T> cutoff = 0.5f)
{
using itype = typename samplerate_converter<T>::itype;
return samplerate_converter<T>(quality, itype(interpolation_factor), itype(decimation_factor), scale,
@@ -376,12 +395,13 @@ inline samplerate_converter<T> sample_rate_converter(sample_rate_conversion_qual
// Deprecated in 0.9.2
template <typename T = fbase>
-inline samplerate_converter<T> resampler(sample_rate_conversion_quality quality, size_t interpolation_factor,
- size_t decimation_factor, subtype<T> scale = subtype<T>(1),
- subtype<T> cutoff = 0.5f)
+KFR_FUNCTION samplerate_converter<T> resampler(sample_rate_conversion_quality quality,
+ size_t interpolation_factor, size_t decimation_factor,
+ subtype<T> scale = subtype<T>(1), subtype<T> cutoff = 0.5f)
{
using itype = typename samplerate_converter<T>::itype;
return samplerate_converter<T>(quality, itype(interpolation_factor), itype(decimation_factor), scale,
cutoff);
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/speaker.hpp b/include/kfr/dsp/speaker.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -27,6 +27,8 @@
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
enum class Speaker : int
{
@@ -93,4 +95,5 @@ enum class SpeakerArrangement : int
Music81 = 27,
Arr102 = 28
};
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/special.hpp b/include/kfr/dsp/special.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -26,16 +26,19 @@
#pragma once
#include "../base/basic_expressions.hpp"
-#include "../base/operators.hpp"
-#include "../base/vec.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
/**
* @brief Returns expression template that generates a unit impulse
*/
template <typename T = int>
-static auto unitimpulse()
+auto unitimpulse()
{
return lambda<T>([](cinput_t, size_t index, auto x) {
if (index == 0)
@@ -46,7 +49,7 @@ static auto unitimpulse()
}
template <typename T = fbase>
-static auto jaehne_arg(size_t size)
+auto jaehne_arg(size_t size)
{
return truncate(constants<T>::pi_s(1, 2) * sqr(linspace(T(0), T(size), size, false)) / size, size);
}
@@ -56,13 +59,13 @@ static auto jaehne_arg(size_t size)
* Generates the sine with linearly increasing frequency from 0hz to nyquist frequency.
*/
template <typename T = fbase>
-static auto jaehne(identity<T> magn, size_t size)
+auto jaehne(identity<T> magn, size_t size)
{
return magn * sin(jaehne_arg<T>(size));
}
template <typename T = fbase>
-static auto swept_arg(size_t size)
+auto swept_arg(size_t size)
{
return truncate(constants<T>::pi_s(1, 4) * sqr(sqr(linspace(T(0), T(size), size, false)) / sqr(T(size))) *
T(size),
@@ -74,8 +77,9 @@ static auto swept_arg(size_t size)
* Generates the sine with logarithmically increasing frequency from 0hz to nyquist frequency.
*/
template <typename T = fbase>
-static auto swept(identity<T> magn, size_t size)
+auto swept(identity<T> magn, size_t size)
{
return magn * sin(swept_arg<T>(size));
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/units.hpp b/include/kfr/dsp/units.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -25,41 +25,43 @@
*/
#pragma once
-#include "../base/abs.hpp"
#include "../base/basic_expressions.hpp"
-#include "../base/log_exp.hpp"
-#include "../base/vec.hpp"
+#include "../math/abs.hpp"
+#include "../math/log_exp.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
using sample_rate_t = double;
namespace intrinsics
{
template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF amp_to_dB(const T& amp)
+KFR_INTRINSIC TF amp_to_dB(const T& amp)
{
return log(static_cast<TF>(abs(amp))) * subtype<TF>(8.6858896380650365530225783783322);
// return T( 20.0 ) * log10( level );
}
template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF dB_to_amp(const T& dB)
+KFR_INTRINSIC TF dB_to_amp(const T& dB)
{
return exp(dB * subtype<TF>(0.11512925464970228420089957273422));
// return exp10( dB / 20 );
}
template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF amp_to_dB(const T& amp, const T& offset)
+KFR_INTRINSIC TF amp_to_dB(const T& amp, const T& offset)
{
return log_fmadd(static_cast<TF>(abs(amp)), subtype<TF>(8.6858896380650365530225783783322), offset);
// return T( 20.0 ) * log10( level );
}
template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset)
+KFR_INTRINSIC TF dB_to_amp(const T& dB, const T& offset)
{
auto offs = -subtype<TF>(0.11512925464970228420089957273422) * offset;
return exp_fmadd(dB, subtype<TF>(0.11512925464970228420089957273422), offs);
@@ -67,13 +69,13 @@ KFR_SINTRIN TF dB_to_amp(const T& dB, const T& offset)
}
template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout power_to_dB(const T& x)
+KFR_INTRINSIC Tout power_to_dB(const T& x)
{
return log(static_cast<Tout>(abs(x))) * (10 * c_recip_log_10<Tout>);
}
template <typename T, typename Tout = flt_type<T>>
-KFR_SINTRIN Tout dB_to_power(const T& x)
+KFR_INTRINSIC Tout dB_to_power(const T& x)
{
if (x == -c_infinity<Tout>)
return 0.0;
@@ -82,7 +84,7 @@ KFR_SINTRIN Tout dB_to_power(const T& x)
}
template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF note_to_hertz(const T& note)
+KFR_INTRINSIC TF note_to_hertz(const T& note)
{
const subtype<TF> offset = 2.1011784386926213177653145771814;
@@ -90,7 +92,7 @@ KFR_SINTRIN TF note_to_hertz(const T& note)
}
template <typename T, typename TF = flt_type<T>>
-KFR_SINTRIN TF hertz_to_note(const T& hertz)
+KFR_INTRINSIC TF hertz_to_note(const T& hertz)
{
const subtype<TF> offset = -36.376316562295915248836189714583;
@@ -98,7 +100,7 @@ KFR_SINTRIN TF hertz_to_note(const T& hertz)
}
template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>>
-KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz)
+KFR_INTRINSIC Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tunehertz)
{
const Tc offset = log(tunehertz) - tunenote * subtype<Tc>(0.05776226504666210911810267678818);
@@ -106,7 +108,7 @@ KFR_SINTRIN Tc note_to_hertz(const T1& note, const T2& tunenote, const T3& tuneh
}
template <typename T1, typename T2, typename T3, typename Tc = flt_type<common_type<T1, T2, T3, f32>>>
-KFR_SINTRIN Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz)
+KFR_INTRINSIC Tc hertz_to_note(const T1& hertz, const T2& tunenote, const T3& tunehertz)
{
const Tc offset = tunenote - log(tunehertz) * subtype<Tc>(17.312340490667560888319096172023);
@@ -121,74 +123,75 @@ KFR_I_FN(power_to_dB)
KFR_I_FN(dB_to_power)
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> note_to_hertz(const T1& x)
+KFR_FUNCTION flt_type<T1> note_to_hertz(const T1& x)
{
return intrinsics::note_to_hertz(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::note_to_hertz, E1> note_to_hertz(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::note_to_hertz, E1> note_to_hertz(E1&& x)
{
return { fn::note_to_hertz(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> hertz_to_note(const T1& x)
+KFR_FUNCTION flt_type<T1> hertz_to_note(const T1& x)
{
return intrinsics::hertz_to_note(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::hertz_to_note, E1> hertz_to_note(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::hertz_to_note, E1> hertz_to_note(E1&& x)
{
return { fn::hertz_to_note(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> amp_to_dB(const T1& x)
+KFR_FUNCTION flt_type<T1> amp_to_dB(const T1& x)
{
return intrinsics::amp_to_dB(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::amp_to_dB, E1> amp_to_dB(E1&& x)
+KFR_INTRINSIC internal::expression_function<fn::amp_to_dB, E1> amp_to_dB(E1&& x)
{
return { fn::amp_to_dB(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> dB_to_amp(const T1& x)
+KFR_FUNCTION flt_type<T1> dB_to_amp(const T1& x)
{
return intrinsics::dB_to_amp(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::dB_to_amp, E1> dB_to_amp(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::dB_to_amp, E1> dB_to_amp(E1&& x)
{
return { fn::dB_to_amp(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> power_to_dB(const T1& x)
+KFR_FUNCTION flt_type<T1> power_to_dB(const T1& x)
{
return intrinsics::power_to_dB(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::power_to_dB, E1> power_to_dB(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::power_to_dB, E1> power_to_dB(E1&& x)
{
return { fn::power_to_dB(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN flt_type<T1> dB_to_power(const T1& x)
+KFR_FUNCTION flt_type<T1> dB_to_power(const T1& x)
{
return intrinsics::dB_to_power(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::dB_to_power, E1> dB_to_power(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::dB_to_power, E1> dB_to_power(E1&& x)
{
return { fn::dB_to_power(), std::forward<E1>(x) };
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/waveshaper.hpp b/include/kfr/dsp/waveshaper.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -25,12 +25,15 @@
*/
#pragma once
-#include "../base/clamp.hpp"
-#include "../base/hyperbolic.hpp"
-#include "../base/operators.hpp"
+#include "../math/clamp.hpp"
+#include "../math/hyperbolic.hpp"
+#include "../simd/operators.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
template <typename E1>
inline auto waveshaper_hardclip(E1&& input, double clip_level)
{
@@ -44,7 +47,7 @@ inline auto waveshaper_tanh(E1&& input, double saturation)
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-CMT_FUNC flt_type<T1> saturate_I(const T1& x)
+KFR_FUNCTION flt_type<T1> saturate_I(const T1& x)
{
const flt_type<T1> xx = -1 / (abs(static_cast<flt_type<T1>>(x)) + 1) + 1;
return mulsign(xx, static_cast<flt_type<T1>>(x));
@@ -52,7 +55,7 @@ CMT_FUNC flt_type<T1> saturate_I(const T1& x)
KFR_FN(saturate_I)
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-CMT_FUNC flt_type<T1> saturate_II(const T1& x)
+KFR_FUNCTION flt_type<T1> saturate_II(const T1& x)
{
const flt_type<T1> xx = sqr(abs(static_cast<flt_type<T1>>(x)) + 1);
return mulsign((xx - 1) / (xx + 1), static_cast<flt_type<T1>>(x));
@@ -60,13 +63,13 @@ CMT_FUNC flt_type<T1> saturate_II(const T1& x)
KFR_FN(saturate_II)
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_FUNC internal::expression_function<fn::saturate_II, E1> saturate_I(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::saturate_II, E1> saturate_I(E1&& x)
{
return { fn::saturate_I(), std::forward<E1>(x) };
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-CMT_FUNC internal::expression_function<fn::saturate_II, E1> saturate_II(E1&& x)
+KFR_FUNCTION internal::expression_function<fn::saturate_II, E1> saturate_II(E1&& x)
{
return { fn::saturate_II(), std::forward<E1>(x) };
}
@@ -88,4 +91,5 @@ inline auto waveshaper_poly(E1&& input, fbase c1, fbase c3, Cs... cs)
{
return horner_odd(input, c1, c3, static_cast<fbase>(cs)...);
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/weighting.hpp b/include/kfr/dsp/weighting.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup dsp_extra
* @{
*/
/*
@@ -25,16 +25,19 @@
*/
#pragma once
-#include "../base/operators.hpp"
-#include "../base/sqrt.hpp"
+#include "../math/sqrt.hpp"
+#include "../simd/operators.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
+
namespace intrinsics
{
template <typename T>
-KFR_SINTRIN T weight_a_unnorm(T f)
+KFR_INTRINSIC T weight_a_unnorm(T f)
{
const T f2 = pow2(f);
const T nom = pow2(12200) * pow4(f);
@@ -46,13 +49,13 @@ template <typename T>
const static T weight_a_gain = reciprocal(weight_a_unnorm(T(1000.0)));
template <typename T>
-KFR_SINTRIN T aweighting(T f)
+KFR_INTRINSIC T aweighting(T f)
{
return weight_a_unnorm(f) * weight_a_gain<subtype<T>>;
}
template <typename T>
-KFR_SINTRIN T weight_b_unnorm(T f)
+KFR_INTRINSIC T weight_b_unnorm(T f)
{
const T f2 = pow2(f);
const T nom = pow2(12200) * pow3(f);
@@ -65,13 +68,13 @@ template <typename T>
const static T weight_b_gain = reciprocal(weight_b_unnorm(T(1000.0)));
template <typename T>
-KFR_SINTRIN T bweighting(T f)
+KFR_INTRINSIC T bweighting(T f)
{
return weight_b_unnorm(f) * weight_b_gain<subtype<T>>;
}
template <typename T>
-KFR_SINTRIN T weight_c_unnorm(T f)
+KFR_INTRINSIC T weight_c_unnorm(T f)
{
const T f2 = pow2(f);
const T nom = pow2(12200) * f2;
@@ -84,7 +87,7 @@ template <typename T>
const static T weight_c_gain = reciprocal(weight_c_unnorm(T(1000.0)));
template <typename T>
-KFR_SINTRIN T cweighting(T f)
+KFR_INTRINSIC T cweighting(T f)
{
return weight_c_unnorm(f) * weight_c_gain<subtype<T>>;
}
@@ -94,38 +97,39 @@ KFR_I_FN(bweighting)
KFR_I_FN(cweighting)
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 aweighting(const T1& x)
+KFR_INTRINSIC T1 aweighting(const T1& x)
{
return intrinsics::aweighting(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::aweighting, E1> aweighting(E1&& x)
+KFR_INTRINSIC internal::expression_function<fn::aweighting, E1> aweighting(E1&& x)
{
return { fn::aweighting(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 bweighting(const T1& x)
+KFR_INTRINSIC T1 bweighting(const T1& x)
{
return intrinsics::bweighting(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::bweighting, E1> bweighting(E1&& x)
+KFR_INTRINSIC internal::expression_function<fn::bweighting, E1> bweighting(E1&& x)
{
return { fn::bweighting(), std::forward<E1>(x) };
}
template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
-KFR_INTRIN T1 cweighting(const T1& x)
+KFR_INTRINSIC T1 cweighting(const T1& x)
{
return intrinsics::cweighting(x);
}
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
-KFR_INTRIN internal::expression_function<fn::cweighting, E1> cweighting(E1&& x)
+KFR_INTRINSIC internal::expression_function<fn::cweighting, E1> cweighting(E1&& x)
{
return { fn::cweighting(), std::forward<E1>(x) };
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/dsp/window.hpp b/include/kfr/dsp/window.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup dsp
+/** @addtogroup window
* @{
*/
/*
@@ -25,15 +25,17 @@
*/
#pragma once
-#include "../base/log_exp.hpp"
-#include "../base/modzerobessel.hpp"
#include "../base/pointer.hpp"
-#include "../base/sin_cos.hpp"
-#include "../base/sqrt.hpp"
-#include "../base/vec.hpp"
+#include "../math/log_exp.hpp"
+#include "../math/modzerobessel.hpp"
+#include "../math/sin_cos.hpp"
+#include "../math/sqrt.hpp"
+#include "../simd/vec.hpp"
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
enum class window_type
{
@@ -125,11 +127,12 @@ struct expression_rectangular : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t, size_t index, vec_t<T, N>) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_rectangular& self, cinput_t,
+ size_t index, vec_shape<T, N>)
{
using TI = utype<T>;
- const vec<TI, N> i = enumerate(vec<TI, N>()) + cast<TI>(index);
- return select(i < cast<TI>(m_size), T(1), T(0));
+ const vec<TI, N> i = enumerate(vec<TI, N>()) + static_cast<TI>(index);
+ return select(i < static_cast<TI>(self.m_size), T(1), T(0));
}
size_t size() const { return m_size; }
@@ -147,9 +150,10 @@ struct expression_triangular : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_triangular& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return 1 - abs(linspace(cinput, index, y));
+ return 1 - abs(get_elements(self.linspace, cinput, index, y));
}
size_t size() const { return m_size; }
@@ -168,9 +172,10 @@ struct expression_bartlett : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bartlett& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return 1 - abs(linspace(cinput, index, y));
+ return 1 - abs(get_elements(self.linspace, cinput, index, y));
}
size_t size() const { return m_size; }
@@ -189,9 +194,10 @@ struct expression_cosine : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_cosine& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return sin(c_pi<T> * linspace(cinput, index, y));
+ return sin(c_pi<T> * get_elements(self.linspace, cinput, index, y));
}
size_t size() const { return m_size; }
@@ -210,9 +216,10 @@ struct expression_hann : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_hann& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return T(0.5) * (T(1) - cos(c_pi<T, 2> * linspace(cinput, index, y)));
+ return T(0.5) * (T(1) - cos(c_pi<T, 2> * get_elements(self.linspace, cinput, index, y)));
}
size_t size() const { return m_size; }
@@ -231,9 +238,10 @@ struct expression_bartlett_hann : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bartlett_hann& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- const vec<T, N> xx = linspace(cinput, index, y);
+ const vec<T, N> xx = get_elements(self.linspace, cinput, index, y);
return T(0.62) - T(0.48) * abs(xx - T(0.5)) + T(0.38) * cos(c_pi<T, 2> * (xx - T(0.5)));
}
size_t size() const { return m_size; }
@@ -253,9 +261,11 @@ struct expression_hamming : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_hamming& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return alpha - (T(1.0) - alpha) * (cos(c_pi<T, 2> * linspace(cinput, index, y)));
+ return self.alpha -
+ (T(1.0) - self.alpha) * (cos(c_pi<T, 2> * get_elements(self.linspace, cinput, index, y)));
}
size_t size() const { return m_size; }
@@ -275,9 +285,10 @@ struct expression_bohman : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_bohman& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- const vec<T, N> n = abs(linspace(cinput, index, y));
+ const vec<T, N> n = abs(get_elements(self.linspace, cinput, index, y));
return (T(1) - n) * cos(c_pi<T> * n) + (T(1) / c_pi<T>)*sin(c_pi<T> * n);
}
size_t size() const { return m_size; }
@@ -297,10 +308,11 @@ struct expression_blackman : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_blackman& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- const vec<T, N> n = linspace(cinput, index, y);
- return a0 - a1 * cos(c_pi<T, 2> * n) + a2 * cos(c_pi<T, 4> * n);
+ const vec<T, N> n = get_elements(self.linspace, cinput, index, y);
+ return self.a0 - self.a1 * cos(c_pi<T, 2> * n) + self.a2 * cos(c_pi<T, 4> * n);
}
size_t size() const { return m_size; }
@@ -320,9 +332,10 @@ struct expression_blackman_harris : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_blackman_harris& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>;
+ const vec<T, N> n = get_elements(self.linspace, cinput, index, y) * c_pi<T, 2>;
return T(0.35875) - T(0.48829) * cos(n) + T(0.14128) * cos(2 * n) - T(0.01168) * cos(3 * n);
}
size_t size() const { return m_size; }
@@ -343,9 +356,11 @@ struct expression_kaiser : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_kaiser& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return modzerobessel(beta * sqrt(1 - sqr(linspace(cinput, index, y)))) * m;
+ return modzerobessel(self.beta * sqrt(1 - sqr(get_elements(self.linspace, cinput, index, y)))) *
+ self.m;
}
size_t size() const { return m_size; }
@@ -366,9 +381,10 @@ struct expression_flattop : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_flattop& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- const vec<T, N> n = linspace(cinput, index, y) * c_pi<T, 2>;
+ const vec<T, N> n = get_elements(self.linspace, cinput, index, y) * c_pi<T, 2>;
constexpr T a0 = 1;
constexpr T a1 = 1.93;
constexpr T a2 = 1.29;
@@ -393,9 +409,10 @@ struct expression_gaussian : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_gaussian& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return exp(T(-0.5) * sqr(alpha * linspace(cinput, index, y)));
+ return exp(T(-0.5) * sqr(self.alpha * get_elements(self.linspace, cinput, index, y)));
}
size_t size() const { return m_size; }
@@ -416,9 +433,10 @@ struct expression_lanczos : input_expression
{
}
template <size_t N>
- CMT_INLINE vec<T, N> operator()(cinput_t cinput, size_t index, vec_t<T, N> y) const
+ KFR_INTRINSIC friend vec<T, N> get_elements(const expression_lanczos& self, cinput_t cinput,
+ size_t index, vec_shape<T, N> y)
{
- return sinc(linspace(cinput, index, y));
+ return sinc(get_elements(self.linspace, cinput, index, y));
}
size_t size() const { return m_size; }
@@ -458,7 +476,7 @@ KFR_WINDOW_BY_TYPE(lanczos)
/**
* @brief Returns template expression that generates Rrectangular window of length @c size
*/
-CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t size)
+KFR_FUNCTION internal::expression_rectangular<fbase> window_rectangular(size_t size)
{
return internal::expression_rectangular<fbase>(size, fbase());
}
@@ -467,7 +485,7 @@ CMT_INLINE internal::expression_rectangular<fbase> window_rectangular(size_t siz
* @brief Returns template expression that generates Triangular window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_triangular<T> window_triangular(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_triangular<T>(size);
}
@@ -476,7 +494,7 @@ CMT_INLINE internal::expression_triangular<T> window_triangular(size_t size, cty
* @brief Returns template expression that generates Bartlett window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_bartlett<T>(size);
}
@@ -485,7 +503,7 @@ CMT_INLINE internal::expression_bartlett<T> window_bartlett(size_t size, ctype_t
* @brief Returns template expression that generates Cosine window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_cosine<T>(size);
}
@@ -494,7 +512,7 @@ CMT_INLINE internal::expression_cosine<T> window_cosine(size_t size, ctype_t<T>
* @brief Returns template expression that generates Hann window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_hann<T>(size);
}
@@ -503,7 +521,8 @@ CMT_INLINE internal::expression_hann<T> window_hann(size_t size, ctype_t<T> = ct
* @brief Returns template expression that generates Bartlett-Hann window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_bartlett_hann<T> window_bartlett_hann(size_t size,
+ ctype_t<T> = ctype_t<T>())
{
return internal::expression_bartlett_hann<T>(size);
}
@@ -513,8 +532,8 @@ CMT_INLINE internal::expression_bartlett_hann<T> window_bartlett_hann(size_t siz
* alpha
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54,
- ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_hamming<T> window_hamming(size_t size, identity<T> alpha = 0.54,
+ ctype_t<T> = ctype_t<T>())
{
return internal::expression_hamming<T>(size, alpha);
}
@@ -523,7 +542,7 @@ CMT_INLINE internal::expression_hamming<T> window_hamming(size_t size, identity<
* @brief Returns template expression that generates Bohman window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_bohman<T>(size);
}
@@ -533,7 +552,7 @@ CMT_INLINE internal::expression_bohman<T> window_bohman(size_t size, ctype_t<T>
* alpha
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_blackman<T> window_blackman(
+KFR_FUNCTION internal::expression_blackman<T> window_blackman(
size_t size, identity<T> alpha = 0.16, window_symmetry symmetry = window_symmetry::symmetric,
ctype_t<T> = ctype_t<T>())
{
@@ -544,7 +563,7 @@ CMT_INLINE internal::expression_blackman<T> window_blackman(
* @brief Returns template expression that generates Blackman-Harris window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris(
+KFR_FUNCTION internal::expression_blackman_harris<T> window_blackman_harris(
size_t size, window_symmetry symmetry = window_symmetry::symmetric, ctype_t<T> = ctype_t<T>())
{
return internal::expression_blackman_harris<T>(size, T(), symmetry);
@@ -555,8 +574,8 @@ CMT_INLINE internal::expression_blackman_harris<T> window_blackman_harris(
* beta
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5),
- ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_kaiser<T> window_kaiser(size_t size, identity<T> beta = T(0.5),
+ ctype_t<T> = ctype_t<T>())
{
return internal::expression_kaiser<T>(size, beta);
}
@@ -565,7 +584,7 @@ CMT_INLINE internal::expression_kaiser<T> window_kaiser(size_t size, identity<T>
* @brief Returns template expression that generates Flat top window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_flattop<T>(size);
}
@@ -575,8 +594,8 @@ CMT_INLINE internal::expression_flattop<T> window_flattop(size_t size, ctype_t<T
* alpha
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> alpha = 2.5,
- ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_gaussian<T> window_gaussian(size_t size, identity<T> alpha = 2.5,
+ ctype_t<T> = ctype_t<T>())
{
return internal::expression_gaussian<T>(size, alpha);
}
@@ -585,7 +604,7 @@ CMT_INLINE internal::expression_gaussian<T> window_gaussian(size_t size, identit
* @brief Returns template expression that generates Lanczos window of length @c size
*/
template <typename T = fbase>
-CMT_INLINE internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>())
+KFR_FUNCTION internal::expression_lanczos<T> window_lanczos(size_t size, ctype_t<T> = ctype_t<T>())
{
return internal::expression_lanczos<T>(size);
}
@@ -615,6 +634,7 @@ CMT_NOINLINE expression_pointer<T> window(size_t size, window_type type, identit
return to_pointer(
typename internal::window_by_type<window>::template type<T>(size, win_param, symmetry));
},
- fn::returns<expression_pointer<T>>());
+ fn_generic::returns<expression_pointer<T>>());
}
+} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/ext/console_colors.hpp b/include/kfr/ext/console_colors.hpp
@@ -1,162 +0,0 @@
-#pragma once
-#include <cstdint>
-#include <cstdio>
-
-//#define CONSOLE_COLORS_FORCE_ASCII
-
-#if defined _WIN32 && !defined PRINT_COLORED_FORCE_ASCII
-#define USE_WIN32_API
-#endif
-
-#if defined(USE_WIN32_API)
-
-namespace win32_lite
-{
-typedef void* HANDLE;
-typedef uint32_t DWORD;
-
-#define WIN32_LITE_STD_INPUT_HANDLE ((win32_lite::DWORD)-10)
-#define WIN32_LITE_STD_OUTPUT_HANDLE ((win32_lite::DWORD)-11)
-#define WIN32_LITE_STD_ERROR_HANDLE ((win32_lite::DWORD)-12)
-
-#define WIN32_LITE_DECLSPEC_IMPORT __declspec(dllimport)
-
-#define WIN32_LITE_WINAPI __stdcall
-
-typedef short SHORT;
-typedef unsigned short WORD;
-typedef int WINBOOL;
-
-extern "C"
-{
- WIN32_LITE_DECLSPEC_IMPORT HANDLE WIN32_LITE_WINAPI GetStdHandle(DWORD nStdHandle);
- WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleTextAttribute(HANDLE hConsoleOutput,
- WORD wAttributes);
-}
-} // namespace win32_lite
-
-#endif
-
-namespace console_colors
-{
-
-enum text_color : uint32_t
-{
- Black = 0x00,
- DarkBlue = 0x01,
- DarkGreen = 0x02,
- DarkCyan = 0x03,
- DarkRed = 0x04,
- DarkMagenta = 0x05,
- DarkYellow = 0x06,
- LightGrey = 0x07,
- Gray = 0x08,
- Blue = 0x09,
- Green = 0x0A,
- Cyan = 0x0B,
- Red = 0x0C,
- Magenta = 0x0D,
- Yellow = 0x0E,
- White = 0x0F,
- BgBlack = 0x00,
- BgDarkBlue = 0x10,
- BgDarkGreen = 0x20,
- BgDarkCyan = 0x30,
- BgDarkRed = 0x40,
- BgDarkMagenta = 0x50,
- BgDarkYellow = 0x60,
- BgLightGrey = 0x70,
- BgGray = 0x80,
- BgBlue = 0x90,
- BgGreen = 0xA0,
- BgCyan = 0xB0,
- BgRed = 0xC0,
- BgMagenta = 0xD0,
- BgYellow = 0xE0,
- BgWhite = 0xF0,
-
- Normal = BgBlack | LightGrey
-};
-
-enum console_buffer
-{
- ConsoleStdOutput,
- ConsoleStdError
-};
-
-struct console_color
-{
-public:
- console_color(text_color c, console_buffer console = ConsoleStdOutput)
- : m_old(get(console)), m_console(console)
- {
- set(c, m_console);
- }
-
- ~console_color() { set(m_old, m_console); }
-
-private:
- text_color get(console_buffer console = ConsoleStdOutput) { return saved_color(); }
-
- void set(text_color new_color, console_buffer console = ConsoleStdOutput)
- {
-#ifdef USE_WIN32_API
- win32_lite::SetConsoleTextAttribute(win32_lite::GetStdHandle(console == ConsoleStdOutput
- ? WIN32_LITE_STD_OUTPUT_HANDLE
- : WIN32_LITE_STD_ERROR_HANDLE),
- static_cast<win32_lite::WORD>(new_color));
-#else
- if (new_color != Normal)
- {
- uint8_t t = new_color & 0xF;
- uint8_t b = (new_color & 0xF0) >> 4;
- uint8_t tnum = 30 + ((t & 1) << 2 | (t & 2) | (t & 4) >> 2);
- uint8_t bnum = 40 + ((b & 1) << 2 | (b & 2) | (b & 4) >> 2);
- if (t & 8)
- tnum += 60;
- if (b & 8)
- bnum += 60;
- std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[%d;%dm", tnum, bnum);
- }
- else
- {
- std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[0m");
- }
-#endif
- saved_color() = new_color;
- }
-
- text_color m_old;
- console_buffer m_console;
- static text_color& saved_color()
- {
- static text_color color = Normal;
- return color;
- }
-};
-
-template <text_color color, console_buffer console = ConsoleStdOutput>
-struct console_color_tpl : public console_color
-{
-public:
- console_color_tpl() : console_color(color, console) {}
-
-private:
-};
-
-typedef console_color_tpl<DarkBlue> darkblue_text;
-typedef console_color_tpl<DarkGreen> darkgreen_text;
-typedef console_color_tpl<DarkCyan> darkcyan_text;
-typedef console_color_tpl<DarkRed> darkred_text;
-typedef console_color_tpl<DarkMagenta> darkmagenta_text;
-typedef console_color_tpl<DarkYellow> darkyellow_text;
-typedef console_color_tpl<LightGrey> lightgrey_text;
-typedef console_color_tpl<Gray> gray_text;
-typedef console_color_tpl<Blue> blue_text;
-typedef console_color_tpl<Green> green_text;
-typedef console_color_tpl<Cyan> cyan_text;
-typedef console_color_tpl<Red> red_text;
-typedef console_color_tpl<Magenta> magenta_text;
-typedef console_color_tpl<Yellow> yellow_text;
-typedef console_color_tpl<White> white_text;
-} // namespace console_colors
diff --git a/include/kfr/ext/double_double.hpp b/include/kfr/ext/double_double.hpp
@@ -1,86 +0,0 @@
-#pragma once
-
-#include <cmath>
-
-struct double_double
-{
- double hi, lo;
-
- constexpr double_double(double x) noexcept : hi(x), lo(0.0) {}
- constexpr double_double(float x) noexcept : hi(x), lo(0.0) {}
- constexpr double_double(double hi, double lo) noexcept : hi(hi + lo), lo((hi - (hi + lo)) + lo) {}
- constexpr operator double() const noexcept { return hi + lo; }
- constexpr operator float() const noexcept { return hi + lo; }
-
- constexpr friend double_double operator-(const double_double& x) noexcept { return { -x.hi, -x.lo }; }
- constexpr friend double_double operator+(const double_double& x, const double_double& y) noexcept
- {
- const double sum = x.hi + y.hi;
- return { sum, std::abs(x.hi) > std::abs(y.hi) ? (((x.hi - sum) + y.hi) + y.lo) + x.lo
- : (((y.hi - sum) + x.hi) + x.lo) + y.lo };
- }
- constexpr friend double_double operator-(const double_double& x, const double_double& y) noexcept
- {
- const double diff = x.hi - y.hi;
- return { diff, std::abs(x.hi) > std::abs(y.hi) ? (((x.hi - diff) - y.hi) - y.lo) + x.lo
- : (((-y.hi - diff) + x.hi) + x.lo) - y.lo };
- }
- constexpr friend double_double operator*(const double_double& x, const double_double& y) noexcept
- {
- const double_double c = mul(x.hi, y.hi);
- const double cc = (x.hi * y.lo + x.lo * y.hi) + c.lo;
- return { c.hi, cc };
- }
- constexpr friend double_double operator/(const double_double& x, const double_double& y) noexcept
- {
- const double c = x.hi / y.hi;
- const double_double u = mul(c, y.hi);
- const double cc = ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) / y.hi;
- return { c, cc };
- }
-
-#if defined _MSC_VER && !defined __clang__
-#define DOUBLEDOUBLE_CONSTEXPR
-#else
-#define DOUBLEDOUBLE_CONSTEXPR constexpr
-#endif
-
- DOUBLEDOUBLE_CONSTEXPR bool isinf() const noexcept { return std::isinf(hi); }
- DOUBLEDOUBLE_CONSTEXPR bool isnan() const noexcept { return std::isnan(hi) || std::isnan(lo); }
-
- DOUBLEDOUBLE_CONSTEXPR double ulp(float value) const noexcept
- {
- if (std::isnan(value) && isnan())
- return 0.0;
- if (std::isinf(value) && isinf() && (std::copysign(1.0f, value) == std::copysign(1.0, hi)))
- return 0.0;
- if (std::nexttoward(value, 0.0) == 0.0)
- return 1.0;
- return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0));
- }
- DOUBLEDOUBLE_CONSTEXPR double ulp(double value) const noexcept
- {
- if (std::isnan(value) && isnan())
- return 0.0;
- if (std::isinf(value) && isinf() && (std::copysign(1.0, value) == std::copysign(1.0, hi)))
- return 0.0;
- if (std::nexttoward(value, 0.0) == 0.0)
- return 1.0;
- return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0));
- }
-
-private:
- constexpr static double_double splitprec(double x) noexcept
- {
- const double p = x * 1.34217729e8;
- const double h = (x - p) + p;
- return { h, x - h };
- }
- constexpr static double_double mul(double x, double y) noexcept
- {
- const double_double xx = splitprec(x);
- const double_double yy = splitprec(y);
- const double z = x * y;
- return { z, ((xx.hi * yy.hi - z) + xx.hi * yy.lo + xx.lo * yy.hi) + xx.lo * yy.lo };
- }
-};
diff --git a/include/kfr/io/audiofile.hpp b/include/kfr/io/audiofile.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup io
+/** @addtogroup audio_io
* @{
*/
/*
@@ -28,8 +28,8 @@
#include "../base/basic_expressions.hpp"
#include "../base/conversion.hpp"
#include "../base/univector.hpp"
-#include "../base/vec.hpp"
#include "../cometa/ctti.hpp"
+#include "../simd/vec.hpp"
#include "file.hpp"
#ifndef KFR_ENABLE_WAV
@@ -64,10 +64,8 @@ struct audio_format
struct audio_format_and_length : audio_format
{
using audio_format::audio_format;
-#ifdef CMT_COMPILER_MSVC
- audio_format_and_length() noexcept {}
-#endif
- audio_format_and_length(const audio_format& fmt) : audio_format(fmt) {}
+ constexpr audio_format_and_length() CMT_NOEXCEPT {}
+ constexpr audio_format_and_length(const audio_format& fmt) : audio_format(fmt) {}
imax length = 0; // in samples
};
@@ -95,39 +93,43 @@ struct audio_writer : public abstract_writer<T>
virtual void close() = 0;
};
-namespace internal
+namespace internal_generic
{
#if KFR_ENABLE_WAV
-static size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, size_t bytesToWrite)
+static inline size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData,
+ size_t bytesToWrite)
{
return file->write(pData, bytesToWrite);
}
-static drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, drwav_seek_origin origin)
+static inline drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset,
+ drwav_seek_origin origin)
{
return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current);
}
-static size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead)
+static inline size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead)
{
return file->read(pBufferOut, bytesToRead);
}
-static drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, drwav_seek_origin origin)
+static inline drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset,
+ drwav_seek_origin origin)
{
return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current);
}
#endif
#if KFR_ENABLE_FLAC
-static size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead)
+static inline size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut,
+ size_t bytesToRead)
{
return file->read(pBufferOut, bytesToRead);
}
-static drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset,
- drflac_seek_origin origin)
+static inline drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset,
+ drflac_seek_origin origin)
{
return file->seek(offset, origin == drflac_seek_origin_start ? seek_origin::begin : seek_origin::current);
}
#endif
-} // namespace internal
+} // namespace internal_generic
#if KFR_ENABLE_WAV
/// @brief WAV format writer
@@ -139,17 +141,19 @@ struct audio_writer_wav : audio_writer<T>
: writer(std::move(writer)), f(nullptr), fmt(fmt)
{
drwav_data_format wav_fmt;
- wav_fmt.channels = fmt.channels;
- wav_fmt.sampleRate = fmt.samplerate;
+ wav_fmt.channels = static_cast<drwav_uint32>(fmt.channels);
+ wav_fmt.sampleRate = static_cast<drwav_uint32>(fmt.samplerate);
wav_fmt.format =
fmt.type >= audio_sample_type::first_float ? DR_WAVE_FORMAT_IEEE_FLOAT : DR_WAVE_FORMAT_PCM;
- wav_fmt.bitsPerSample = audio_sample_bit_depth(fmt.type);
+ wav_fmt.bitsPerSample = static_cast<drwav_uint32>(audio_sample_bit_depth(fmt.type));
wav_fmt.container = fmt.use_w64 ? drwav_container_w64 : drwav_container_riff;
- f = drwav_open_write(&wav_fmt, (drwav_write_proc)&internal::drwav_writer_write_proc,
- (drwav_seek_proc)&internal::drwav_writer_seek_proc, this->writer.get());
+ f = drwav_open_write(&wav_fmt, (drwav_write_proc)&internal_generic::drwav_writer_write_proc,
+ (drwav_seek_proc)&internal_generic::drwav_writer_seek_proc, this->writer.get());
}
~audio_writer_wav() { close(); }
+ using audio_writer<T>::write;
+
/// @brief Write data to underlying binary writer
size_t write(const T* data, size_t size) override
{
@@ -184,7 +188,7 @@ struct audio_writer_wav : audio_writer<T>
imax tell() const override { return fmt.length; }
- bool seek(imax position, seek_origin origin) override { return false; }
+ bool seek(imax, seek_origin) override { return false; }
private:
std::shared_ptr<abstract_writer<>> writer;
@@ -199,8 +203,8 @@ struct audio_reader_wav : audio_reader<T>
/// @brief Constructs WAV reader
audio_reader_wav(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader))
{
- f = drwav_open((drwav_read_proc)&internal::drwav_reader_read_proc,
- (drwav_seek_proc)&internal::drwav_reader_seek_proc, this->reader.get());
+ f = drwav_open((drwav_read_proc)&internal_generic::drwav_reader_read_proc,
+ (drwav_seek_proc)&internal_generic::drwav_reader_seek_proc, this->reader.get());
fmt.channels = f->channels;
fmt.samplerate = f->sampleRate;
fmt.length = f->totalSampleCount / fmt.channels;
@@ -307,8 +311,8 @@ struct audio_reader_flac : audio_reader<T>
/// @brief Constructs FLAC reader
audio_reader_flac(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader))
{
- f = drflac_open((drflac_read_proc)&internal::drflac_reader_read_proc,
- (drflac_seek_proc)&internal::drflac_reader_seek_proc, this->reader.get());
+ f = drflac_open((drflac_read_proc)&internal_generic::drflac_reader_read_proc,
+ (drflac_seek_proc)&internal_generic::drflac_reader_seek_proc, this->reader.get());
fmt.channels = f->channels;
fmt.samplerate = f->sampleRate;
fmt.length = f->totalSampleCount / fmt.channels;
diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup io
+/** @addtogroup binary_io
* @{
*/
/*
@@ -25,9 +25,9 @@
*/
#pragma once
-#include "../base/function.hpp"
#include "../base/univector.hpp"
-#include "../base/vec.hpp"
+#include "../simd/impl/function.hpp"
+#include "../simd/vec.hpp"
#include <cstdio>
#include <string>
#include <vector>
@@ -63,6 +63,7 @@ inline FILE* fopen_portable(const filepath_char* path, const filepath_char* mode
#ifdef CMT_OS_WIN
FILE* f = nullptr;
errno_t e = _wfopen_s(&f, path, mode);
+ (void)e;
return f;
#else
return fopen(path, mode);
@@ -98,6 +99,14 @@ struct abstract_stream
bool seek(imax offset, int origin) { return seek(offset, static_cast<seek_origin>(origin)); }
};
+namespace internal_generic
+{
+struct empty
+{
+};
+
+} // namespace internal_generic
+
/// @brief Base class for all typed readers
template <typename T = void>
struct abstract_reader : abstract_stream<T>
@@ -117,6 +126,10 @@ struct abstract_reader : abstract_stream<T>
this->read(result);
return result;
}
+ bool read(conditional<is_void<T>::value, internal_generic::empty, T>& data)
+ {
+ return read(&data, 1) == 1;
+ }
};
/// @brief Base class for all typed writers
@@ -131,6 +144,10 @@ struct abstract_writer : abstract_stream<T>
return write(data.data(), data.size());
}
size_t write(univector_ref<const T>&& data) { return write(data.data(), data.size()); }
+ bool write(const conditional<is_void<T>::value, internal_generic::empty, T>& data)
+ {
+ return write(&data, 1) == 1;
+ }
};
template <typename From, typename To = void>
@@ -207,6 +224,8 @@ struct file_reader : abstract_reader<T>
~file_reader() override {}
size_t read(T* data, size_t size) final { return fread(data, element_size<T>(), size, handle.file); }
+ using abstract_reader<T>::read;
+
imax tell() const final { return IO_TELL_64(handle.file); }
bool seek(imax offset, seek_origin origin) final
{
@@ -221,6 +240,8 @@ struct file_writer : abstract_writer<T>
{
file_writer(file_handle&& handle) : handle(std::move(handle)) {}
~file_writer() override {}
+
+ using abstract_writer<T>::write;
size_t write(const T* data, size_t size) final
{
return fwrite(data, element_size<T>(), size, handle.file);
diff --git a/include/kfr/io/impl/audiofile-impl.cpp b/include/kfr/io/impl/audiofile-impl.cpp
@@ -25,6 +25,8 @@
*/
#include "../audiofile.hpp"
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wimplicit-fallthrough")
#if defined(KFR_ENABLE_WAV) && KFR_ENABLE_WAV
#define DR_WAV_NO_STDIO
@@ -37,3 +39,5 @@
#define DR_FLAC_NO_STDIO
#include "../dr/dr_flac.h"
#endif
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/io/python_plot.hpp b/include/kfr/io/python_plot.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup io
+/** @addtogroup plotting
* @{
*/
/*
@@ -24,8 +24,8 @@
See https://www.kfrlib.com for details.
*/
#pragma once
-#include "../base/vec.hpp"
#include "../cometa/string.hpp"
+#include "../simd/vec.hpp"
#include <cstdlib>
#ifdef CMT_OS_WIN
@@ -38,7 +38,7 @@
namespace kfr
{
-namespace internal
+namespace internal_generic
{
CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wdeprecated-declarations")
@@ -51,7 +51,7 @@ void python(const std::string& name, const std::string& code)
std::string filename;
{
char curdir[1024];
- cross_getcwd(curdir, arraysize(curdir));
+ (void)cross_getcwd(curdir, arraysize(curdir));
filename = curdir;
}
#ifdef CMT_OS_WIN
@@ -64,7 +64,7 @@ void python(const std::string& name, const std::string& code)
FILE* f = fopen(filename.c_str(), "w");
fwrite(code.c_str(), 1, code.size(), f);
fclose(f);
- std::system(("python \"" + filename + "\"").c_str());
+ (void)std::system(("python \"" + filename + "\"").c_str());
}
CMT_PRAGMA_GNU(GCC diagnostic pop)
@@ -78,7 +78,7 @@ inline T flush_to_zero(T value)
{
return static_cast<double>(value);
}
-} // namespace internal
+} // namespace internal_generic
inline std::string concat_args() { return {}; }
@@ -106,7 +106,7 @@ void plot_show(const std::string& name, const std::string& wavfile, const std::s
std::string ss;
ss += python_prologue() + "dspplot.plot(" + concat_args("r'" + wavfile + "'", options) + ")\n";
- internal::python(name, ss);
+ internal_generic::python(name, ss);
print("done\n");
}
@@ -125,12 +125,12 @@ void plot_show(const std::string& name, const T& x, const std::string& options =
std::string ss;
ss += python_prologue() + "data = [\n";
for (size_t i = 0; i < array.size(); i++)
- ss += as_string(fmt<'g', 20, 17>(internal::flush_to_zero(array[i])), ",\n");
+ ss += as_string(fmt<'g', 20, 17>(internal_generic::flush_to_zero(array[i])), ",\n");
ss += "]\n";
ss += "dspplot.plot(" + concat_args("data", options) + ")\n";
- internal::python(name, ss);
+ internal_generic::python(name, ss);
print("done\n");
}
@@ -170,7 +170,7 @@ void perfplot_show(const std::string& name, T1&& data, T2&& labels, const std::s
ss += "dspplot.perfplot(" + concat_args("data, labels", options) + ")\n";
- internal::python(name, ss);
+ internal_generic::python(name, ss);
print("done\n");
}
diff --git a/include/kfr/io/tostring.hpp b/include/kfr/io/tostring.hpp
@@ -1,4 +1,4 @@
-/** @addtogroup io
+/** @addtogroup string_io
* @{
*/
/*
@@ -25,15 +25,50 @@
*/
#pragma once
-#include "../base/complex.hpp"
#include "../base/univector.hpp"
-#include "../base/vec.hpp"
#include "../cometa/string.hpp"
+#include "../simd/complex.hpp"
+#include "../simd/vec.hpp"
#include <cmath>
namespace cometa
{
+template <>
+struct representation<cometa::special_value>
+{
+ using type = std::string;
+ static std::string get(const cometa::special_value& value)
+ {
+ using cometa::special_constant;
+ switch (value.c)
+ {
+ case special_constant::undefined:
+ return "undefined";
+ case special_constant::default_constructed:
+ return "default_constructed";
+ case special_constant::infinity:
+ return "infinity";
+ case special_constant::neg_infinity:
+ return "neg_infinity";
+ case special_constant::min:
+ return "min";
+ case special_constant::max:
+ return "max";
+ case special_constant::neg_max:
+ return "neg_max";
+ case special_constant::lowest:
+ return "lowest";
+ case special_constant::integer:
+ return as_string(value.ll);
+ case special_constant::floating_point:
+ return as_string(value.d);
+ default:
+ return "unknown";
+ }
+ }
+};
+
namespace details
{
@@ -157,10 +192,21 @@ struct representation<kfr::univector<T, Tag>>
return details::array_to_string(value.data(), value.size());
}
};
+template <typename T, size_t Size>
+struct representation<std::array<T, Size>>
+{
+ using type = std::string;
+ static std::string get(const std::array<T, Size>& value)
+ {
+ return details::array_to_string(value.data(), value.size());
+ }
+};
} // namespace cometa
namespace kfr
{
+inline namespace CMT_ARCH_NAME
+{
namespace internal
{
@@ -205,6 +251,7 @@ inline internal::expression_printer printer() { return internal::expression_prin
/// @brief Returns an output expression that prints the values with their types (used for debug)
inline internal::expression_debug_printer debug_printer() { return internal::expression_debug_printer(); }
+} // namespace CMT_ARCH_NAME
/// @brief Converts dB value to string (uses oo for infinity symbol)
template <typename T>
diff --git a/include/kfr/kfr.h b/include/kfr/kfr.h
@@ -0,0 +1,70 @@
+/** @addtogroup utility
+ * @{
+ */
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cident.h"
+
+#define KFR_VERSION_MAJOR 3
+#define KFR_VERSION_MINOR 0
+#define KFR_VERSION_PATCH 5
+#define KFR_VERSION_LABEL "rc"
+
+#define KFR_VERSION_STRING \
+ CMT_STRINGIFY(KFR_VERSION_MAJOR) \
+ "." CMT_STRINGIFY(KFR_VERSION_MINOR) "." CMT_STRINGIFY(KFR_VERSION_PATCH) "-" KFR_VERSION_LABEL
+#define KFR_VERSION (KFR_VERSION_MAJOR * 10000 + KFR_VERSION_MINOR * 100 + KFR_VERSION_PATCH)
+
+#if defined DEBUG || defined KFR_DEBUG
+#define KFR_DEBUG_STR " debug"
+#elif defined NDEBUG || defined KFR_NDEBUG
+#define KFR_DEBUG_STR " optimized"
+#else
+#define KFR_DEBUG_STR ""
+#endif
+
+#define KFR_NATIVE_INTRINSICS 1
+
+#if defined CMT_COMPILER_CLANG && !defined CMT_DISABLE_CLANG_EXT
+#define CMT_CLANG_EXT
+#endif
+
+#ifdef KFR_NATIVE_INTRINSICS
+#define KFR_BUILD_DETAILS_1 " +in"
+#else
+#define KFR_BUILD_DETAILS_1 ""
+#endif
+
+#ifdef CMT_CLANG_EXT
+#define KFR_BUILD_DETAILS_2 " +ve"
+#else
+#define KFR_BUILD_DETAILS_2 ""
+#endif
+
+#define KFR_VERSION_FULL \
+ "KFR " KFR_VERSION_STRING KFR_DEBUG_STR \
+ " " CMT_STRINGIFY(CMT_ARCH_NAME) " " CMT_ARCH_BITNESS_NAME " (" CMT_COMPILER_FULL_NAME "/" CMT_OS_NAME \
+ ")" KFR_BUILD_DETAILS_1 KFR_BUILD_DETAILS_2
+
+#ifdef __cplusplus
+namespace kfr
+{
+/// @brief KFR version string
+constexpr const char version_string[] = KFR_VERSION_STRING;
+
+constexpr int version_major = KFR_VERSION_MAJOR;
+constexpr int version_minor = KFR_VERSION_MINOR;
+constexpr int version_patch = KFR_VERSION_PATCH;
+constexpr int version = KFR_VERSION;
+
+/// @brief KFR version string including architecture and compiler name
+constexpr const char version_full[] = KFR_VERSION_FULL;
+} // namespace kfr
+#endif
+
+#define KFR_INTRINSIC CMT_INTRINSIC
+#define KFR_MEM_INTRINSIC CMT_MEM_INTRINSIC
+#define KFR_FUNCTION CMT_FUNCTION
diff --git a/include/kfr/math.hpp b/include/kfr/math.hpp
@@ -22,4 +22,24 @@
*/
#pragma once
-#include "base.hpp"
+#include "simd.hpp"
+
+#include "math/abs.hpp"
+#include "math/asin_acos.hpp"
+#include "math/atan.hpp"
+#include "math/clamp.hpp"
+#include "math/compiletime.hpp"
+#include "math/complex_math.hpp"
+#include "math/gamma.hpp"
+#include "math/hyperbolic.hpp"
+#include "math/interpolation.hpp"
+#include "math/log_exp.hpp"
+#include "math/logical.hpp"
+#include "math/min_max.hpp"
+#include "math/modzerobessel.hpp"
+#include "math/round.hpp"
+#include "math/saturation.hpp"
+#include "math/select.hpp"
+#include "math/sin_cos.hpp"
+#include "math/sqrt.hpp"
+#include "math/tan.hpp"
diff --git a/include/kfr/math/abs.hpp b/include/kfr/math/abs.hpp
@@ -0,0 +1,54 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/abs.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the absolute value of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 abs(const T1& x)
+{
+ return intrinsics::abs(x);
+}
+
+/**
+ * @brief Returns template expression that returns the absolute value of x.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::abs, E1> abs(E1&& x)
+{
+ return { fn::abs(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
diff --git a/include/kfr/math/asin_acos.hpp b/include/kfr/math/asin_acos.hpp
@@ -0,0 +1,71 @@
+/** @addtogroup trigonometric
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/asin_acos.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the arc sine of x. The returned angle is in the range \f$-\pi/2\f$ through \f$\pi/2\f$.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC flt_type<T1> asin(const T1& x)
+{
+ return intrinsics::asin(x);
+}
+
+/**
+ * @brief Returns template expression that returns the arc sine of x.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::asin, E1> asin(E1&& x)
+{
+ return { fn::asin(), std::forward<E1>(x) };
+}
+/**
+ * @brief Returns the arc cosine of x. The returned angle is in the range 0 through \f$\pi\f$.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC flt_type<T1> acos(const T1& x)
+{
+ return intrinsics::acos(x);
+}
+
+/**
+ * @brief Returns template expression that returns the arc cosine of x.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::acos, E1> acos(E1&& x)
+{
+ return { fn::acos(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
diff --git a/include/kfr/math/atan.hpp b/include/kfr/math/atan.hpp
@@ -0,0 +1,110 @@
+/** @addtogroup trigonometric
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/atan.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the arc tangent of x. The returned angle is in the range \f$-\pi/2\f$ through
+ * \f$\pi/2\f$.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> atan(const T1& x)
+{
+ return intrinsics::atan(x);
+}
+
+/**
+ * @brief Returns template expression that returns the arc tangent of x.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::atan, E1> atan(E1&& x)
+{
+ return { fn::atan(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the arc tangent of the x, expressed in degrees. The returned angle is in the range -90
+ * through 90.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> atandeg(const T1& x)
+{
+ return intrinsics::atandeg(x);
+}
+
+/**
+ * @brief Returns template expression that returns the arc tangent of the x, expressed in degrees.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::atandeg, E1> atandeg(E1&& x)
+{
+ return { fn::atandeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
+ */
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_FUNCTION common_type<T1, T2> atan2(const T1& x, const T2& y)
+{
+ return intrinsics::atan2(x, y);
+}
+
+/**
+ * @brief Returns template expression that returns the arc tangent of y/x.
+ */
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_FUNCTION internal::expression_function<fn::atan2, E1, E2> atan2(E1&& x, E2&& y)
+{
+ return { fn::atan2(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/**
+ * @brief Returns the arc tangent of y/x (expressed in degrees) using the signs of arguments to determine the
+ * correct quadrant.
+ */
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_FUNCTION common_type<T1, T2> atan2deg(const T1& x, const T2& y)
+{
+ return intrinsics::atan2deg(x, y);
+}
+
+/**
+ * @brief Returns template expression that returns the arc tangent of y/x (expressed in degrees).
+ */
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_FUNCTION internal::expression_function<fn::atan2deg, E1, E2> atan2deg(E1&& x, E2&& y)
+{
+ return { fn::atan2deg(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/clamp.hpp b/include/kfr/math/clamp.hpp
@@ -0,0 +1,65 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/clamp.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Returns the first argument clamped to a range [lo, hi]
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value),
+ typename Tout = common_type<T1, T2, T3>>
+KFR_INTRINSIC Tout clamp(const T1& x, const T2& lo, const T3& hi)
+{
+ return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(lo), static_cast<Tout>(hi));
+}
+
+/// @brief Creates an expression that returns the first argument clamped to a range [lo, hi]
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
+KFR_INTRINSIC internal::expression_function<fn::clamp, E1, E2, E3> clamp(E1&& x, E2&& lo, E3&& hi)
+{
+ return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(lo), std::forward<E3>(hi) };
+}
+
+/// @brief Returns the first argument clamped to a range [0, hi]
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout clamp(const T1& x, const T2& hi)
+{
+ return intrinsics::clamp(static_cast<Tout>(x), static_cast<Tout>(hi));
+}
+
+/// @brief Creates an expression that returns the first argument clamped to a range [0, hi]
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::clamp, E1, E2> clamp(E1&& x, E2&& hi)
+{
+ return { fn::clamp(), std::forward<E1>(x), std::forward<E2>(hi) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/compiletime.hpp b/include/kfr/math/compiletime.hpp
@@ -0,0 +1,84 @@
+/** @addtogroup math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+#include "../simd/constants.hpp"
+#include "../simd/operators.hpp"
+#include "../simd/types.hpp"
+
+namespace kfr
+{
+
+namespace compiletime
+{
+
+template <typename T>
+constexpr inline T select(bool c, T x, T y)
+{
+ return c ? x : y;
+}
+template <typename T>
+constexpr inline T trunc(T x)
+{
+ return static_cast<T>(static_cast<long long>(x));
+}
+template <typename T>
+constexpr inline T abs(T x)
+{
+ return x < T() ? -x : x;
+}
+template <typename T>
+constexpr inline T mulsign(T x, T y)
+{
+ return y < T() ? -x : x;
+}
+template <typename T>
+constexpr inline T sin(T x)
+{
+ x = x - trunc(x / c_pi<T, 2>) * c_pi<T, 2>;
+ constexpr T c2 = -0.16665853559970855712890625;
+ constexpr T c4 = +8.31427983939647674560546875e-3;
+ constexpr T c6 = -1.85423981747590005397796630859375e-4;
+
+ x -= c_pi<T>;
+ T y = abs(x);
+ y = select(y > c_pi<T, 1, 2>, c_pi<T> - y, y);
+ y = mulsign(y, -x);
+
+ const T y2 = y * y;
+ T formula = c6;
+ const T y3 = y2 * y;
+ formula = fmadd(formula, y2, c4);
+ formula = fmadd(formula, y2, c2);
+ formula = formula * y3 + y;
+ return formula;
+}
+template <typename T>
+constexpr inline T cos(T x)
+{
+ return sin(x + c_pi<T, 1, 2>);
+}
+} // namespace compiletime
+} // namespace kfr
diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp
@@ -0,0 +1,410 @@
+/** @addtogroup complex
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../simd/complex.hpp"
+#include "abs.hpp"
+#include "atan.hpp"
+#include "hyperbolic.hpp"
+#include "log_exp.hpp"
+#include "min_max.hpp"
+#include "select.hpp"
+#include "sin_cos.hpp"
+#include "sqrt.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> csin(const vec<complex<T>, N>& x)
+{
+ return ccomp(sincos(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x))));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> csinh(const vec<complex<T>, N>& x)
+{
+ return ccomp(sinhcosh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> ccos(const vec<complex<T>, N>& x)
+{
+ return ccomp(negodd(cossin(cdecom(cdupreal(x))) * coshsinh(cdecom(cdupimag(x)))));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x)
+{
+ return ccomp(coshsinh(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> cabs(const vec<complex<T>, N>& x)
+{
+ const vec<T, N* 2> xx = sqr(cdecom(x));
+ return sqrt(even(xx) + odd(xx));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> carg(const vec<complex<T>, N>& x)
+{
+ const vec<T, N* 2> xx = cdecom(x);
+ return atan2(even(xx), odd(xx));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> clog(const vec<complex<T>, N>& x)
+{
+ return make_complex(log(cabs(x)), carg(x));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> clog2(const vec<complex<T>, N>& x)
+{
+ return clog(x) * avoid_odr_use(c_recip_log_2<T>);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> clog10(const vec<complex<T>, N>& x)
+{
+ return clog(x) * avoid_odr_use(c_recip_log_10<T>);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cexp(const vec<complex<T>, N>& x)
+{
+ return ccomp(exp(cdecom(cdupreal(x))) * cossin(cdecom(cdupimag(x))));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cexp2(const vec<complex<T>, N>& x)
+{
+ return cexp(x * avoid_odr_use(c_log_2<T>));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cexp10(const vec<complex<T>, N>& x)
+{
+ return cexp(x * avoid_odr_use(c_log_10<T>));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> polar(const vec<complex<T>, N>& x)
+{
+ return make_complex(cabs(x), carg(x));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x)
+{
+ return cdupreal(x) * ccomp(cossin(cdecom(cdupimag(x))));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> cabsdup(const vec<T, N>& x)
+{
+ x = sqr(x);
+ return sqrt(x + swap<2>(x));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> csqrt(const vec<complex<T>, N>& x)
+{
+ const vec<T, N> t = (cabsdup(cdecom(x)) + cdecom(cnegimag(cdupreal(x)))) * T(0.5);
+ return ccomp(select(dupodd(x) < T(), cdecom(cnegimag(ccomp(t))), t));
+}
+
+KFR_HANDLE_SCALAR(cconj)
+KFR_HANDLE_SCALAR(csin)
+KFR_HANDLE_SCALAR(csinh)
+KFR_HANDLE_SCALAR(ccos)
+KFR_HANDLE_SCALAR(ccosh)
+KFR_HANDLE_SCALAR(clog)
+KFR_HANDLE_SCALAR(clog2)
+KFR_HANDLE_SCALAR(clog10)
+KFR_HANDLE_SCALAR(cexp)
+KFR_HANDLE_SCALAR(cexp2)
+KFR_HANDLE_SCALAR(cexp10)
+KFR_HANDLE_SCALAR(polar)
+KFR_HANDLE_SCALAR(cartesian)
+KFR_HANDLE_SCALAR(csqrt)
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> cabs(const vec<T, N>& a)
+{
+ return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a)));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> carg(const vec<T, N>& a)
+{
+ return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a)));
+}
+template <typename T1>
+KFR_INTRINSIC realtype<T1> cabs(const T1& a)
+{
+ using vecout = vec1<T1>;
+ return to_scalar(intrinsics::cabs(vecout(a)));
+}
+template <typename T1>
+KFR_INTRINSIC realtype<T1> carg(const T1& a)
+{
+ using vecout = vec1<T1>;
+ return to_scalar(intrinsics::carg(vecout(a)));
+}
+} // namespace intrinsics
+
+KFR_I_FN(cconj)
+KFR_I_FN(csin)
+KFR_I_FN(csinh)
+KFR_I_FN(ccos)
+KFR_I_FN(ccosh)
+KFR_I_FN(cabs)
+KFR_I_FN(carg)
+KFR_I_FN(clog)
+KFR_I_FN(clog2)
+KFR_I_FN(clog10)
+KFR_I_FN(cexp)
+KFR_I_FN(cexp2)
+KFR_I_FN(cexp10)
+KFR_I_FN(polar)
+KFR_I_FN(cartesian)
+KFR_I_FN(csqrt)
+
+/// @brief Returns the sine of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 csin(const T1& x)
+{
+ return intrinsics::csin(x);
+}
+
+/// @brief Returns template expression that returns the sine of the complex value x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::csin, E1> csin(E1&& x)
+{
+ return { fn::csin(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic sine of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 csinh(const T1& x)
+{
+ return intrinsics::csinh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic sine of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::csinh, E1> csinh(E1&& x)
+{
+ return { fn::csinh(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the cosine of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 ccos(const T1& x)
+{
+ return intrinsics::ccos(x);
+}
+
+/// @brief Returns template expression that returns the cosine of the complex value x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::ccos, E1> ccos(E1&& x)
+{
+ return { fn::ccos(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic cosine of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 ccosh(const T1& x)
+{
+ return intrinsics::ccosh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic cosine of the complex value x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::ccosh, E1> ccosh(E1&& x)
+{
+ return { fn::ccosh(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the absolute value (magnitude) of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION realtype<T1> cabs(const T1& x)
+{
+ return intrinsics::cabs(x);
+}
+
+/// @brief Returns template expression that returns the absolute value (magnitude) of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cabs, E1> cabs(E1&& x)
+{
+ return { fn::cabs(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the phase angle (argument) of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION realtype<T1> carg(const T1& x)
+{
+ return intrinsics::carg(x);
+}
+
+/// @brief Returns template expression that returns the phase angle (argument) of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::carg, E1> carg(E1&& x)
+{
+ return { fn::carg(), std::forward<E1>(x) };
+}
+
+/// @brief Returns template expression that returns the complex conjugate of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cconj, E1> cconj(E1&& x)
+{
+ return { fn::cconj(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the natural logarithm of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 clog(const T1& x)
+{
+ return intrinsics::clog(x);
+}
+
+/// @brief Returns template expression that returns the natural logarithm of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::clog, E1> clog(E1&& x)
+{
+ return { fn::clog(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the binary (base-2) logarithm of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 clog2(const T1& x)
+{
+ return intrinsics::clog2(x);
+}
+
+/// @brief Returns template expression that returns the binary (base-2) logarithm of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::clog2, E1> clog2(E1&& x)
+{
+ return { fn::clog2(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the common (base-10) logarithm of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 clog10(const T1& x)
+{
+ return intrinsics::clog10(x);
+}
+
+/// @brief Returns template expression that returns the common (base-10) logarithm of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::clog10, E1> clog10(E1&& x)
+{
+ return { fn::clog10(), std::forward<E1>(x) };
+}
+
+/// @brief Returns \f$e\f$ raised to the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 cexp(const T1& x)
+{
+ return intrinsics::cexp(x);
+}
+
+/// @brief Returns template expression that returns \f$e\f$ raised to the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cexp, E1> cexp(E1&& x)
+{
+ return { fn::cexp(), std::forward<E1>(x) };
+}
+
+/// @brief Returns 2 raised to the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 cexp2(const T1& x)
+{
+ return intrinsics::cexp2(x);
+}
+
+/// @brief Returns template expression that returns 2 raised to the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cexp2, E1> cexp2(E1&& x)
+{
+ return { fn::cexp2(), std::forward<E1>(x) };
+}
+
+/// @brief Returns 10 raised to the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 cexp10(const T1& x)
+{
+ return intrinsics::cexp10(x);
+}
+
+/// @brief Returns template expression that returns 10 raised to the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cexp10, E1> cexp10(E1&& x)
+{
+ return { fn::cexp10(), std::forward<E1>(x) };
+}
+
+/// @brief Converts complex number to polar
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 polar(const T1& x)
+{
+ return intrinsics::polar(x);
+}
+
+/// @brief Returns template expression that converts complex number to polar
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::polar, E1> polar(E1&& x)
+{
+ return { fn::polar(), std::forward<E1>(x) };
+}
+
+/// @brief Converts complex number to cartesian
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 cartesian(const T1& x)
+{
+ return intrinsics::cartesian(x);
+}
+
+/// @brief Returns template expression that converts complex number to cartesian
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cartesian, E1> cartesian(E1&& x)
+{
+ return { fn::cartesian(), std::forward<E1>(x) };
+}
+
+/// @brief Returns square root of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 csqrt(const T1& x)
+{
+ return intrinsics::csqrt(x);
+}
+
+/// @brief Returns template expression that returns square root of the complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::csqrt, E1> csqrt(E1&& x)
+{
+ return { fn::csqrt(), std::forward<E1>(x) };
+}
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/gamma.hpp b/include/kfr/math/gamma.hpp
@@ -0,0 +1,63 @@
+/** @addtogroup other_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/gamma.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Returns the approximate gamma function of an argument
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> gamma(const T1& x)
+{
+ return intrinsics::gamma(x);
+}
+
+/// @brief Creates expression that returns the approximate gamma function of an argument
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::gamma, E1> gamma(E1&& x)
+{
+ return { fn::gamma(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the approximate factorial of an argument
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> factorial_approx(const T1& x)
+{
+ return intrinsics::factorial_approx(x);
+}
+
+/// @brief Creates expression that returns the approximate factorial of an argument
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::factorial_approx, E1> factorial_approx(E1&& x)
+{
+ return { fn::factorial_approx(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/hyperbolic.hpp b/include/kfr/math/hyperbolic.hpp
@@ -0,0 +1,123 @@
+/** @addtogroup hyperbolic
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/hyperbolic.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Returns the hyperbolic sine of the x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sinh(const T1& x)
+{
+ return intrinsics::sinh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic sine of the x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sinh, E1> sinh(E1&& x)
+{
+ return { fn::sinh(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic cosine of the x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> cosh(const T1& x)
+{
+ return intrinsics::cosh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic cosine of the x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cosh, E1> cosh(E1&& x)
+{
+ return { fn::cosh(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic tangent of the x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> tanh(const T1& x)
+{
+ return intrinsics::tanh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic tangent of the x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::tanh, E1> tanh(E1&& x)
+{
+ return { fn::tanh(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic cotangent of the x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> coth(const T1& x)
+{
+ return intrinsics::coth(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic cotangent of the x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::coth, E1> coth(E1&& x)
+{
+ return { fn::coth(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic sine of the even elements of the x and the hyperbolic cosine of the odd
+/// elements of the x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sinhcosh(const T1& x)
+{
+ return intrinsics::sinhcosh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic sine of the even elements of the x and the
+/// hyperbolic cosine of the odd elements of the x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sinhcosh, E1> sinhcosh(E1&& x)
+{
+ return { fn::sinhcosh(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the hyperbolic cosine of the even elements of the x and the hyperbolic sine of the odd
+/// elements of the x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> coshsinh(const T1& x)
+{
+ return intrinsics::coshsinh(x);
+}
+
+/// @brief Returns template expression that returns the hyperbolic cosine of the even elements of the x and
+/// the hyperbolic sine of the odd elements of the x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::coshsinh, E1> coshsinh(E1&& x)
+{
+ return { fn::coshsinh(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/abs.hpp b/include/kfr/math/impl/abs.hpp
@@ -0,0 +1,138 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/select.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSSE3 && defined KFR_NATIVE_INTRINSICS
+
+// floating point
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
+{
+ return x & avoid_odr_use(special_constants<T>::invhighbitmask());
+}
+
+KFR_INTRINSIC i64sse abs(const i64sse& x) CMT_NOEXCEPT
+{
+ const __m128i sh = _mm_srai_epi32(x.v, 31);
+ const __m128i msk = _mm_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
+ return _mm_sub_epi64(_mm_xor_si128(x.v, msk), msk);
+}
+KFR_INTRINSIC i32sse abs(const i32sse& x) CMT_NOEXCEPT { return _mm_abs_epi32(x.v); }
+KFR_INTRINSIC i16sse abs(const i16sse& x) CMT_NOEXCEPT { return _mm_abs_epi16(x.v); }
+KFR_INTRINSIC i8sse abs(const i8sse& x) CMT_NOEXCEPT { return _mm_abs_epi8(x.v); }
+KFR_INTRINSIC u64sse abs(const u64sse& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32sse abs(const u32sse& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16sse abs(const u16sse& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u8sse abs(const u8sse& x) CMT_NOEXCEPT { return x; }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC i64avx abs(const i64avx& x) CMT_NOEXCEPT
+{
+ const __m256i sh = _mm256_srai_epi32(x.v, 31);
+ const __m256i msk = _mm256_shuffle_epi32(sh, _MM_SHUFFLE(3, 3, 1, 1));
+ return _mm256_sub_epi64(_mm256_xor_si256(x.v, msk), msk);
+}
+KFR_INTRINSIC i32avx abs(const i32avx& x) CMT_NOEXCEPT { return _mm256_abs_epi32(x.v); }
+KFR_INTRINSIC i16avx abs(const i16avx& x) CMT_NOEXCEPT { return _mm256_abs_epi16(x.v); }
+KFR_INTRINSIC i8avx abs(const i8avx& x) CMT_NOEXCEPT { return _mm256_abs_epi8(x.v); }
+KFR_INTRINSIC u64avx abs(const u64avx& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32avx abs(const u32avx& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16avx abs(const u16avx& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u8avx abs(const u8avx& x) CMT_NOEXCEPT { return x; }
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC i64avx512 abs(const i64avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi64(x.v); }
+KFR_INTRINSIC i32avx512 abs(const i32avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi32(x.v); }
+KFR_INTRINSIC i16avx512 abs(const i16avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi16(x.v); }
+KFR_INTRINSIC i8avx512 abs(const i8avx512& x) CMT_NOEXCEPT { return _mm512_abs_epi8(x.v); }
+KFR_INTRINSIC u64avx512 abs(const u64avx512& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32avx512 abs(const u32avx512& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16avx512 abs(const u16avx512& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u8avx512 abs(const u8avx512& x) CMT_NOEXCEPT { return x; }
+#endif
+
+KFR_HANDLE_ALL_SIZES_1_IF(abs, !is_f_class<T>::value)
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC i8neon abs(const i8neon& x) CMT_NOEXCEPT { return vabsq_s8(x.v); }
+KFR_INTRINSIC i16neon abs(const i16neon& x) CMT_NOEXCEPT { return vabsq_s16(x.v); }
+KFR_INTRINSIC i32neon abs(const i32neon& x) CMT_NOEXCEPT { return vabsq_s32(x.v); }
+#if defined CMT_ARCH_NEON64
+KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return vabsq_s64(x.v); }
+#else
+KFR_INTRINSIC i64neon abs(const i64neon& x) CMT_NOEXCEPT { return select(x >= 0, x, -x); }
+#endif
+
+KFR_INTRINSIC u8neon abs(const u8neon& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u16neon abs(const u16neon& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u32neon abs(const u32neon& x) CMT_NOEXCEPT { return x; }
+KFR_INTRINSIC u64neon abs(const u64neon& x) CMT_NOEXCEPT { return x; }
+
+KFR_INTRINSIC f32neon abs(const f32neon& x) CMT_NOEXCEPT { return vabsq_f32(x.v); }
+#if defined CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT { return vabsq_f64(x.v); }
+#else
+KFR_INTRINSIC f64neon abs(const f64neon& x) CMT_NOEXCEPT
+{
+ return x & avoid_odr_use(special_constants<f64>::invhighbitmask());
+}
+#endif
+
+KFR_HANDLE_ALL_SIZES_1(abs)
+
+#else
+
+// floating point
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
+{
+ return x & avoid_odr_use(special_constants<T>::invhighbitmask());
+}
+
+// fallback
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> abs(const vec<T, N>& x) CMT_NOEXCEPT
+{
+ return select(x >= T(0), x, -x);
+}
+#endif
+KFR_HANDLE_SCALAR(abs)
+} // namespace intrinsics
+
+KFR_I_FN(abs)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/asin_acos.hpp b/include/kfr/math/impl/asin_acos.hpp
@@ -0,0 +1,58 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/atan.hpp"
+#include "../../math/select.hpp"
+#include "../../math/sqrt.hpp"
+#include "../../simd/impl/function.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> asin(const vec<T, N>& x)
+{
+ const vec<Tout, N> xx = x;
+ return atan2(xx, sqrt(Tout(1) - xx * xx));
+}
+
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> acos(const vec<T, N>& x)
+{
+ const vec<Tout, N> xx = x;
+ return atan2(sqrt(Tout(1) - xx * xx), xx);
+}
+KFR_HANDLE_SCALAR(asin)
+KFR_HANDLE_SCALAR(acos)
+} // namespace intrinsics
+KFR_I_FN(asin)
+KFR_I_FN(acos)
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
diff --git a/include/kfr/math/impl/atan.hpp b/include/kfr/math/impl/atan.hpp
@@ -0,0 +1,230 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+#include "../../math/abs.hpp"
+#include "../../math/select.hpp"
+#include "../../math/sin_cos.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> atan2k(const vec<f32, N>& yy, const vec<f32, N>& xx)
+{
+ vec<f32, N> x = xx, y = yy;
+ vec<f32, N> s, t, u;
+ vec<i32, N> q;
+ q = select(x < 0, -2, 0);
+ x = select(x < 0, -x, x);
+ mask<i32, N> m;
+ m = y > x;
+ t = x;
+ x = select(m, y, x);
+ y = select(m, -t, y);
+ q = select(m, q + 1, q);
+ s = y / x;
+ t = s * s;
+ u = 0.00282363896258175373077393f;
+ u = fmadd(u, t, -0.0159569028764963150024414f);
+ u = fmadd(u, t, 0.0425049886107444763183594f);
+ u = fmadd(u, t, -0.0748900920152664184570312f);
+ u = fmadd(u, t, 0.106347933411598205566406f);
+ u = fmadd(u, t, -0.142027363181114196777344f);
+ u = fmadd(u, t, 0.199926957488059997558594f);
+ u = fmadd(u, t, -0.333331018686294555664062f);
+ t = u * t * s + s;
+ t = innercast<f32>(q) * 1.5707963267948966192313216916398f + t;
+ return t;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> atan2k(const vec<f64, N>& yy, const vec<f64, N>& xx)
+{
+ vec<f64, N> x = xx, y = yy;
+ vec<f64, N> s, t, u;
+ vec<i64, N> q;
+ q = select(x < 0, i64(-2), i64(0));
+ x = select(x < 0, -x, x);
+ mask<i64, N> m;
+ m = y > x;
+ t = x;
+ x = select(m, y, x);
+ y = select(m, -t, y);
+ q = select(m, q + i64(1), q);
+ s = y / x;
+ t = s * s;
+ u = -1.88796008463073496563746e-05;
+ u = fmadd(u, t, 0.000209850076645816976906797);
+ u = fmadd(u, t, -0.00110611831486672482563471);
+ u = fmadd(u, t, 0.00370026744188713119232403);
+ u = fmadd(u, t, -0.00889896195887655491740809);
+ u = fmadd(u, t, 0.016599329773529201970117);
+ u = fmadd(u, t, -0.0254517624932312641616861);
+ u = fmadd(u, t, 0.0337852580001353069993897);
+ u = fmadd(u, t, -0.0407629191276836500001934);
+ u = fmadd(u, t, 0.0466667150077840625632675);
+ u = fmadd(u, t, -0.0523674852303482457616113);
+ u = fmadd(u, t, 0.0587666392926673580854313);
+ u = fmadd(u, t, -0.0666573579361080525984562);
+ u = fmadd(u, t, 0.0769219538311769618355029);
+ u = fmadd(u, t, -0.090908995008245008229153);
+ u = fmadd(u, t, 0.111111105648261418443745);
+ u = fmadd(u, t, -0.14285714266771329383765);
+ u = fmadd(u, t, 0.199999999996591265594148);
+ u = fmadd(u, t, -0.333333333333311110369124);
+ t = u * t * s + s;
+ t = innercast<f64>(q) * 1.5707963267948966192313216916398 + t;
+ return t;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x)
+{
+ vec<f32, N> r = atan2k(abs(y), x);
+ constexpr f32 pi = 3.1415926535897932384626433832795f;
+ constexpr f32 pi_over_2 = 1.5707963267948966192313216916398f;
+ constexpr f32 pi_over_4 = 0.78539816339744830961566084581988f;
+ r = mulsign(r, x);
+ r = select(isinf(x) || x == 0.0f, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0f), r);
+ r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0f), r);
+ r = select(y == 0.0f, select(x < 0.f, pi, 0.f), r);
+ r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
+ return r;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x)
+{
+ vec<f64, N> r = atan2k(abs(y), x);
+ constexpr f64 pi = 3.1415926535897932384626433832795;
+ constexpr f64 pi_over_2 = 1.5707963267948966192313216916398;
+ constexpr f64 pi_over_4 = 0.78539816339744830961566084581988;
+ r = mulsign(r, x);
+ r = select(isinf(x) || x == 0.0, pi_over_2 - select(x.asmask(), mulsign(pi_over_2, x), 0.0), r);
+ r = select(isinf(y), pi_over_2 - select(x.asmask(), mulsign(pi_over_4, x), 0.0), r);
+ r = select(y == 0.0, select(x < 0., pi, 0.), r);
+ r = (isnan(x) || isnan(y)).asvec() | mulsign(r, y);
+ return r;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> atan(const vec<f32, N>& x)
+{
+ vec<f32, N> t, u;
+ vec<i32, N> q;
+ q = select(x < 0.f, 2, 0);
+ vec<f32, N> s = select(x < 0.f, -x, x);
+ q = select(s > 1.f, q | 1, q);
+ s = select(s > 1.f, 1.0f / s, s);
+ t = s * s;
+ u = 0.00282363896258175373077393f;
+ u = fmadd(u, t, -0.0159569028764963150024414f);
+ u = fmadd(u, t, 0.0425049886107444763183594f);
+ u = fmadd(u, t, -0.0748900920152664184570312f);
+ u = fmadd(u, t, 0.106347933411598205566406f);
+ u = fmadd(u, t, -0.142027363181114196777344f);
+ u = fmadd(u, t, 0.199926957488059997558594f);
+ u = fmadd(u, t, -0.333331018686294555664062f);
+ t = s + s * (t * u);
+ t = select((q & 1) != 0, 1.570796326794896557998982f - t, t);
+ t = select((q & 2) != 0, -t, t);
+ return t;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> atan(const vec<f64, N>& x)
+{
+ vec<f64, N> t, u;
+ vec<i64, N> q;
+ q = select(x < 0.0, i64(2), i64(0));
+ vec<f64, N> s = select(x < 0.0, -x, x);
+ q = select(s > 1.0, q | 1, q);
+ s = select(s > 1.0, 1.0 / s, s);
+ t = s * s;
+ u = -1.88796008463073496563746e-05;
+ u = fmadd(u, t, 0.000209850076645816976906797);
+ u = fmadd(u, t, -0.00110611831486672482563471);
+ u = fmadd(u, t, 0.00370026744188713119232403);
+ u = fmadd(u, t, -0.00889896195887655491740809);
+ u = fmadd(u, t, 0.016599329773529201970117);
+ u = fmadd(u, t, -0.0254517624932312641616861);
+ u = fmadd(u, t, 0.0337852580001353069993897);
+ u = fmadd(u, t, -0.0407629191276836500001934);
+ u = fmadd(u, t, 0.0466667150077840625632675);
+ u = fmadd(u, t, -0.0523674852303482457616113);
+ u = fmadd(u, t, 0.0587666392926673580854313);
+ u = fmadd(u, t, -0.0666573579361080525984562);
+ u = fmadd(u, t, 0.0769219538311769618355029);
+ u = fmadd(u, t, -0.090908995008245008229153);
+ u = fmadd(u, t, 0.111111105648261418443745);
+ u = fmadd(u, t, -0.14285714266771329383765);
+ u = fmadd(u, t, 0.199999999996591265594148);
+ u = fmadd(u, t, -0.333333333333311110369124);
+ t = s + s * (t * u);
+ t = select((q & 1) != 0, 1.570796326794896557998982 - t, t);
+ t = select((q & 2) != 0, -t, t);
+ return t;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> atandeg(const vec<f32, N>& x)
+{
+ return atan(x) * c_radtodeg<f32>;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> atandeg(const vec<f64, N>& x)
+{
+ return atan(x) * c_radtodeg<f64>;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> atan2deg(const vec<f32, N>& y, const vec<f32, N>& x)
+{
+ return atan2(y, x) * c_radtodeg<f32>;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> atan2deg(const vec<f64, N>& y, const vec<f64, N>& x)
+{
+ return atan2(y, x) * c_radtodeg<f64>;
+}
+
+KFR_HANDLE_SCALAR(atan)
+KFR_HANDLE_SCALAR(atan2)
+KFR_HANDLE_SCALAR(atandeg)
+KFR_HANDLE_SCALAR(atan2deg)
+} // namespace intrinsics
+KFR_I_FN(atan)
+KFR_I_FN(atandeg)
+KFR_I_FN(atan2)
+KFR_I_FN(atan2deg)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/clamp.hpp b/include/kfr/math/impl/clamp.hpp
@@ -0,0 +1,55 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/min_max.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T>
+KFR_INTRINSIC T clamp(const T& x, const T& lo, const T& hi)
+{
+    // Clamp x into [lo, hi]. The upper bound is applied first and operand
+    // order is kept identical to min/max conventions used elsewhere.
+    const T upper_bounded = min(x, hi);
+    return max(upper_bounded, lo);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi)
+{
+    // Lane-wise clamp of a vector into [lo, hi].
+    const vec<T, N> upper_bounded = min(x, hi);
+    return max(upper_bounded, lo);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi)
+{
+    // Two-argument form: the lower bound is implicitly zero, i.e. [0, hi].
+    const vec<T, N> upper_bounded = min(x, hi);
+    return max(upper_bounded, zerovector<T, N>());
+}
+} // namespace intrinsics
+KFR_I_FN(clamp)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/gamma.hpp b/include/kfr/math/impl/gamma.hpp
@@ -0,0 +1,71 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+#include "../../math/log_exp.hpp"
+#include "../../simd/impl/function.hpp"
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+#if CMT_HAS_WARNING("-Wc99-extensions")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
+#endif
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+// Precomputed coefficients for the gamma approximation below, written as
+// C99 hexadecimal floats (hence the -Wc99-extensions suppression above).
+// Element [0] seeds the accumulator; elements [1..11] are the numerators of
+// the 1/(z + k) partial-fraction terms.
+template <typename T>
+constexpr T gamma_precalc[] = {
+    0x2.81b263fec4e08p+0,  0x3.07b4100e04448p+16, -0xa.a0da01d4d4e2p+16, 0xf.05ccb27bb9dbp+16,
+    -0xa.fa79616b7c6ep+16, 0x4.6dd6c10d4df5p+16,  -0xf.a2304199eb4ap+12, 0x1.c21dd4aade3dp+12,
+    -0x1.62f981f01cf84p+8, 0x5.a937aa5c48d98p+0,  -0x3.c640bf82e2104p-8, 0xc.914c540f959cp-24,
+};
+
+// Gamma function approximation (Spouge-style series):
+//   gamma(z) ~= [c0 + sum_{k=1}^{11} ck / (z + k)] * e^{-(z+12)} * (z+12)^{z+0.5} / z
+// NOTE(review): accuracy over the negative half-axis and for large |z| is
+// not established by this file — confirm against the test suite.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> gamma(const vec<T, N>& z)
+{
+    constexpr size_t Count = arraysize(gamma_precalc<T>); // 12 coefficients
+    vec<T, N> accm = gamma_precalc<T>[0]; // additive constant c0
+    CMT_LOOP_UNROLL
+    for (size_t k = 1; k < Count; k++)
+        accm += gamma_precalc<T>[k] / (z + innercast<utype<T>>(k)); // partial fractions
+    accm *= exp(-(z + Count)) * pow(z + Count, z + 0.5); // shifted Stirling-like factor
+    return accm / z;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> factorial_approx(const vec<T, N>& x)
+{
+    // x! == gamma(x + 1); inherits the accuracy of the gamma approximation.
+    const vec<T, N> shifted = x + T(1);
+    return gamma(shifted);
+}
+KFR_HANDLE_SCALAR(gamma)
+KFR_HANDLE_SCALAR(factorial_approx)
+} // namespace intrinsics
+KFR_I_FN(gamma)
+KFR_I_FN(factorial_approx)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/math/impl/hyperbolic.hpp b/include/kfr/math/impl/hyperbolic.hpp
@@ -0,0 +1,99 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/abs.hpp"
+#include "../../math/log_exp.hpp"
+#include "../../math/min_max.hpp"
+#include "../../math/select.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> sinh(const vec<T, N>& x)
+{
+    // Hyperbolic sine via the definition sinh(x) = (e^x - e^-x) / 2,
+    // evaluated in the floating result type Tout.
+    const vec<Tout, N> fx  = static_cast<vec<Tout, N>>(x);
+    const vec<Tout, N> pos = exp(fx);
+    const vec<Tout, N> neg = exp(-fx);
+    return (pos - neg) * Tout(0.5);
+}
+
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> cosh(const vec<T, N>& x)
+{
+    // Hyperbolic cosine via the definition cosh(x) = (e^x + e^-x) / 2.
+    const vec<Tout, N> fx  = static_cast<vec<Tout, N>>(x);
+    const vec<Tout, N> pos = exp(fx);
+    const vec<Tout, N> neg = exp(-fx);
+    return (pos + neg) * Tout(0.5);
+}
+
+// Hyperbolic tangent: tanh(x) = (e^{2x} - 1) / (e^{2x} + 1).
+// NOTE(review): for large positive x the intermediate exp(2*x) can overflow
+// to infinity, producing inf/inf = NaN instead of saturating at 1 — confirm
+// the expected input range with callers.
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> tanh(const vec<T, N>& x)
+{
+    const vec<Tout, N> a = exp(2 * x);
+    return (a - 1) / (a + 1);
+}
+
+// Hyperbolic cotangent: coth(x) = (e^{2x} + 1) / (e^{2x} - 1).
+// At x == 0 this divides 2 by 0 (the pole of coth); very large positive x
+// may hit the same exp(2*x) overflow as tanh above — see NOTE there.
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> coth(const vec<T, N>& x)
+{
+    const vec<Tout, N> a = exp(2 * x);
+    return (a + 1) / (a - 1);
+}
+
+// Computes sinh and cosh together from one pair of exponentials:
+// subadd alternates (a - b) and (a + b) across lanes, so lanes hold
+// (e^x - e^-x)/2 = sinh and (e^x + e^-x)/2 = cosh interleaved.
+// NOTE(review): which parity of lane gets sinh depends on subadd's
+// convention — confirm against the simd operators header.
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> sinhcosh(const vec<T, N>& x)
+{
+    const vec<Tout, N> a = exp(x);
+    const vec<Tout, N> b = exp(-x);
+    return subadd(a, b) * Tout(0.5);
+}
+
+// Companion of sinhcosh with the opposite lane interleaving: addsub
+// alternates (a + b) and (a - b), yielding cosh/sinh swapped relative to
+// sinhcosh above. Same lane-parity caveat applies.
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> coshsinh(const vec<T, N>& x)
+{
+    const vec<Tout, N> a = exp(x);
+    const vec<Tout, N> b = exp(-x);
+    return addsub(a, b) * Tout(0.5);
+}
+
+// Scalar overloads (result promoted to the floating type of T).
+KFR_HANDLE_SCALAR_1_T(sinh, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(cosh, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(tanh, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(coth, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(sinhcosh, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(coshsinh, flt_type<T>)
+} // namespace intrinsics
+// Function-object wrappers for expression templates.
+KFR_I_FN(sinh)
+KFR_I_FN(cosh)
+KFR_I_FN(tanh)
+KFR_I_FN(coth)
+KFR_I_FN(sinhcosh)
+KFR_I_FN(coshsinh)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/log_exp.hpp b/include/kfr/math/impl/log_exp.hpp
@@ -0,0 +1,335 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/abs.hpp"
+#include "../../math/clamp.hpp"
+#include "../../math/min_max.hpp"
+#include "../../math/round.hpp"
+#include "../../math/select.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+#include "../../simd/shuffle.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Returns the binary exponent of d plus one (ilogb(d) + 1) per lane.
+// Subnormal inputs (below 2^-64) are pre-scaled by 2^64 so the exponent
+// field becomes readable; the extra 64 is subtracted back afterwards.
+template <size_t N>
+KFR_INTRINSIC vec<i32, N> vilogbp1(const vec<f32, N>& d)
+{
+    mask<i32, N> m = d < 5.421010862427522E-20f; // 2^-64: subnormal range
+    vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff; // biased exponent bits
+    q = select(m, q - (64 + 0x7e), q - 0x7e); // unbias; undo the 2^64 scaling for small inputs
+    return q;
+}
+
+// Double-precision counterpart: inputs below 2^-300 are scaled by 2^300
+// before the exponent field is extracted from bits 52..62.
+template <size_t N>
+KFR_INTRINSIC vec<i64, N> vilogbp1(const vec<f64, N>& d)
+{
+    mask<i64, N> m = d < 4.9090934652977266E-91; // 2^-300
+    vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff; // biased exponent bits
+    q = select(m, q - (300 + 0x03fe), q - 0x03fe); // unbias; undo the 2^300 scaling
+    return q;
+}
+
+// x * 2^q, with the exponent applied in several pieces (pow4 of a quarter
+// step plus a residual factor) so that intermediate scale factors stay
+// representable even when q alone would overflow the exponent field.
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q)
+{
+    vec<i32, N> m = q >> 31; // all-ones for negative q, zero otherwise (arithmetic shift)
+    m = (((m + q) >> 6) - m) << 4; // ~ (q / 64) * 16, rounded toward zero
+    const vec<i32, N> qq = q - (m << 2); // residual exponent after removing 4*m
+    m = clamp(m + 0x7f, vec<i32, N>(0xff)); // bias and clamp to a valid exponent field
+    vec<f32, N> u = pow4(bitcast<f32>(innercast<i32>(m) << 23)); // (2^(m-127))^4
+    return x * u * bitcast<f32>((innercast<i32>(qq + 0x7f)) << 23); // apply residual 2^qq
+}
+
+// Double-precision x * 2^q with the same split-exponent scheme as the f32
+// variant (field width 11 bits, bias 0x3ff, mantissa shift 52).
+// NOTE(review): `q >> 31` extracts the sign of a 32-bit lane in the f32
+// version, but here the lane is 64-bit — a shift by 63 would be the exact
+// analogue. Works only if q stays within 32-bit range; confirm intent.
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q)
+{
+    vec<i64, N> m = q >> 31;
+    m = (((m + q) >> 9) - m) << 7; // ~ (q / 512) * 128
+    const vec<i64, N> qq = q - (m << 2); // residual exponent
+    m = clamp(m + 0x3ff, i64(0x7ff)); // bias and clamp to the 11-bit field
+    vec<f64, N> u = pow4(bitcast<f64>(innercast<i64>(m) << 52)); // (2^(m-1023))^4
+    return x * u * bitcast<f64>((innercast<i64>(qq + 0x3ff)) << 52); // apply residual
+}
+
+// Binary exponent of x as a floating value: vilogbp1 returns ilogb + 1,
+// hence the -1; x == 0 (T() is zero) maps to -infinity, matching logb.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> logb(const vec<T, N>& x)
+{
+    return select(x == T(), -avoid_odr_use(c_infinity<T>), static_cast<vec<T, N>>(vilogbp1(x) - 1));
+}
+
+// Natural logarithm, single precision.
+// Range reduction: d = m * 2^e with m near 1 (the 0.7071 ~ 1/sqrt(2)
+// factor centers m), then log(m) = 2*atanh((m-1)/(m+1)) is evaluated as an
+// odd polynomial in x = (m-1)/(m+1).
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> log(const vec<f32, N>& d)
+{
+    vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f );
+    vec<f32, N> m = vldexpk(d, -e); // mantissa-like factor in [~0.7071, ~1.4142)
+
+    vec<f32, N> x  = (m - 1.0f) / (m + 1.0f); // atanh argument
+    vec<f32, N> x2 = x * x;
+
+    // Special result for non-positive input: NaN for d < 0, -inf otherwise
+    // (d == 0). NOTE(review): a NaN input also lands here and yields -inf —
+    // confirm whether NaN propagation is expected instead.
+    vec<f32, N> sp =
+        select(d < 0, avoid_odr_use(constants<f32>::qnan), avoid_odr_use(constants<f32>::neginfinity));
+
+    // Minimax polynomial for 2*atanh(x)/x; do not alter the constants.
+    vec<f32, N> t = 0.2371599674224853515625f;
+    t             = fmadd(t, x2, 0.285279005765914916992188f);
+    t             = fmadd(t, x2, 0.400005519390106201171875f);
+    t             = fmadd(t, x2, 0.666666567325592041015625f);
+    t             = fmadd(t, x2, 2.0f);
+
+    x = x * t + c_log_2<f32> * innercast<f32>(e); // recombine with e * ln(2)
+    x = select(d > 0, x, sp);
+
+    return x;
+}
+
+// Natural logarithm, double precision. Same atanh-series scheme as the f32
+// overload with a longer minimax polynomial; see notes there (including the
+// NaN-input -> -inf caveat).
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> log(const vec<f64, N>& d)
+{
+    vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 );
+    vec<f64, N> m = vldexpk(d, -e);
+
+    vec<f64, N> x  = (m - 1.0) / (m + 1.0); // atanh argument
+    vec<f64, N> x2 = x * x;
+
+    vec<f64, N> sp =
+        select(d < 0, avoid_odr_use(constants<f64>::qnan), avoid_odr_use(constants<f64>::neginfinity));
+
+    // Minimax polynomial coefficients — do not alter.
+    vec<f64, N> t = 0.148197055177935105296783;
+    t             = fmadd(t, x2, 0.153108178020442575739679);
+    t             = fmadd(t, x2, 0.181837339521549679055568);
+    t             = fmadd(t, x2, 0.22222194152736701733275);
+    t             = fmadd(t, x2, 0.285714288030134544449368);
+    t             = fmadd(t, x2, 0.399999999989941956712869);
+    t             = fmadd(t, x2, 0.666666666666685503450651);
+    t             = fmadd(t, x2, 2);
+
+    x = x * t + avoid_odr_use(constants<f64>::log_2) * innercast<f64>(e);
+    x = select(d > 0, x, sp);
+
+    return x;
+}
+
+// Base-2 logarithm: natural log scaled by 1/ln(2).
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> log2(const vec<T, N>& x)
+{
+    const vec<Tout, N> natural = log(innercast<Tout>(x));
+    return natural * avoid_odr_use(constants<Tout>::recip_log_2);
+}
+// Base-10 logarithm: natural log scaled by 1/ln(10).
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> log10(const vec<T, N>& x)
+{
+    const vec<Tout, N> natural = log(innercast<Tout>(x));
+    return natural * avoid_odr_use(constants<Tout>::recip_log_10);
+}
+
+// e^d, single precision. Cody-Waite reduction: d = q*ln(2) + s with ln(2)
+// split into two parts so the subtraction stays exact, then a degree-7
+// polynomial approximates e^s on the reduced interval, and 2^q is applied
+// via vldexpk.
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> exp(const vec<f32, N>& d)
+{
+    const f32 ln2_part1 = 0.6931457519f; // high part of ln(2)
+    const f32 ln2_part2 = 1.4286067653e-6f; // low-order correction
+
+    vec<i32, N> q = innercast<i32>(floor(d * avoid_odr_use(constants<f32>::recip_log_2)));
+    vec<f32, N> s, u;
+
+    s = fmadd(innercast<f32>(q), -ln2_part1, d); // remove q*ln2 (high part)
+    s = fmadd(innercast<f32>(q), -ln2_part2, s); // remove q*ln2 (low part)
+
+    // Taylor-like minimax coefficients 1/k! — do not alter.
+    const f32 c2 = 0.4999999105930328369140625f;
+    const f32 c3 = 0.166668415069580078125f;
+    const f32 c4 = 4.16539050638675689697265625e-2f;
+    const f32 c5 = 8.378830738365650177001953125e-3f;
+    const f32 c6 = 1.304379315115511417388916015625e-3f;
+    const f32 c7 = 2.7555381529964506626129150390625e-4f;
+
+    u = c7;
+    u = fmadd(u, s, c6);
+    u = fmadd(u, s, c5);
+    u = fmadd(u, s, c4);
+    u = fmadd(u, s, c3);
+    u = fmadd(u, s, c2);
+
+    u = s * s * u + s + 1.0f; // e^s
+    u = vldexpk(u, q); // * 2^q
+
+    // exp(-inf) == 0 must be forced: the reduction above would not produce it.
+    u = select(d == avoid_odr_use(constants<f32>::neginfinity), 0.f, u);
+
+    return u;
+}
+
+// e^d, double precision. Same Cody-Waite reduction as the f32 overload with
+// a degree-11 polynomial; see notes there.
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> exp(const vec<f64, N>& d)
+{
+    const f64 ln2_part1 = 0.69314717501401901245; // high part of ln(2)
+    const f64 ln2_part2 = 5.545926273775592108e-009; // low-order correction
+
+    vec<i64, N> q = innercast<i64>(floor(d * avoid_odr_use(constants<f64>::recip_log_2)));
+    vec<f64, N> s, u;
+
+    s = fmadd(innercast<f64>(q), -ln2_part1, d);
+    s = fmadd(innercast<f64>(q), -ln2_part2, s);
+
+    // Minimax coefficients ~1/k! — do not alter.
+    const f64 c2  = 0.499999999999994948485237955537741072475910186767578;
+    const f64 c3  = 0.166666666667024204739888659787538927048444747924805;
+    const f64 c4  = 4.16666666578945840693215529881854308769106864929199e-2;
+    const f64 c5  = 8.3333334397461874404333670440792047884315252304077e-3;
+    const f64 c6  = 1.3888881489747750223179290074426717183087021112442e-3;
+    const f64 c7  = 1.9841587032493949419205414574918222569976933300495e-4;
+    const f64 c8  = 2.47929324077393282239802768662784160369483288377523e-5;
+    const f64 c9  = 2.77076037925831049422552981864598109496000688523054e-6;
+    const f64 c10 = 2.59589616274586264243611237120812340606335055781528e-7;
+    const f64 c11 = 3.43801438838789632454461529017381016259946591162588e-8;
+
+    u = c11;
+    u = fmadd(u, s, c10);
+    u = fmadd(u, s, c9);
+    u = fmadd(u, s, c8);
+    u = fmadd(u, s, c7);
+    u = fmadd(u, s, c6);
+    u = fmadd(u, s, c5);
+    u = fmadd(u, s, c4);
+    u = fmadd(u, s, c3);
+    u = fmadd(u, s, c2);
+
+    u = s * s * u + s + 1.0; // e^s
+    u = vldexpk(u, q); // * 2^q
+
+    u = select(d == avoid_odr_use(constants<f64>::neginfinity), 0.0, u); // exp(-inf) == 0
+
+    return u;
+}
+// 2^x computed as e^(x * ln(2)).
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> exp2(const vec<T, N>& x)
+{
+    const vec<Tout, N> scaled = x * avoid_odr_use(constants<Tout>::log_2);
+    return exp(scaled);
+}
+// 10^x computed as e^(x * ln(10)).
+template <typename T, size_t N, typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> exp10(const vec<T, N>& x)
+{
+    const vec<Tout, N> scaled = x * avoid_odr_use(constants<Tout>::log_10);
+    return exp(scaled);
+}
+
+// a^b computed as e^(b * ln|a|) with the sign handled by selection:
+//   a > 0          -> t
+//   a == 0         -> 0
+//   a < 0, b int   -> +t for even b, -t for odd b
+//   a < 0, b frac  -> NaN (result is complex)
+// NOTE(review): the parity test casts b to the integer type, which is only
+// meaningful while b fits that range — confirm expected exponent magnitudes.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b)
+{
+    const vec<T, N> t      = exp(b * log(abs(a)));
+    const mask<T, N> isint  = floor(b) == b; // b has no fractional part
+    const mask<T, N> iseven = (innercast<itype<T>>(b) & 1) == 0;
+    return select(
+        a > T(), t,
+        select(a == T(), T(),
+               select(isint, select(iseven, t, -t), broadcast<N>(avoid_odr_use(constants<T>::qnan)))));
+}
+
+// b-th root of x: x^(1/b) evaluated as e^(ln(x) / b).
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b)
+{
+    const vec<T, N> inv_b = reciprocal(b);
+    return exp(inv_b * log(x));
+}
+
+// Cube root via pow(x, 1/3).
+// NOTE(review): because pow treats a negative base with a fractional
+// exponent as NaN, cbrt of a negative value returns NaN here, whereas the
+// mathematical (and std::cbrt) cube root of a negative is defined — confirm
+// whether this is intentional.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> cbrt(const vec<T, N>& x)
+{
+    return pow<T, N>(x, T(0.333333333333333333333333333333333));
+}
+
+// Integral input: promote to the floating type, then take the cube root.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = flt_type<T>>
+KFR_INTRINSIC vec<Tout, N> cbrt(const vec<T, N>& x)
+{
+    const vec<Tout, N> as_float = innercast<Tout>(x);
+    return cbrt(as_float);
+}
+
+// Scalar overloads (promote to the floating type of T).
+KFR_HANDLE_SCALAR_1_T(exp, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(exp2, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(exp10, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(log, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(log2, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(log10, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(logb, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(pow, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(root, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(cbrt, flt_type<T>)
+
+// Mixed-argument overloads.
+KFR_HANDLE_ARGS_T(exp, flt_type<T>)
+KFR_HANDLE_ARGS_T(exp2, flt_type<T>)
+KFR_HANDLE_ARGS_T(exp10, flt_type<T>)
+KFR_HANDLE_ARGS_T(log, flt_type<T>)
+KFR_HANDLE_ARGS_T(log2, flt_type<T>)
+KFR_HANDLE_ARGS_T(log10, flt_type<T>)
+KFR_HANDLE_ARGS_T(logb, flt_type<T>)
+KFR_HANDLE_ARGS_T(pow, flt_type<T>)
+KFR_HANDLE_ARGS_T(root, flt_type<T>)
+KFR_HANDLE_ARGS_T(cbrt, flt_type<T>)
+
+// Non-floating input forwarding.
+KFR_HANDLE_NOT_F_1(exp)
+KFR_HANDLE_NOT_F_1(log)
+KFR_HANDLE_NOT_F_1(logb)
+KFR_HANDLE_NOT_F_1(pow)
+KFR_HANDLE_NOT_F_1(root)
+KFR_HANDLE_NOT_F_1(cbrt)
+
+// Logarithm of a in an arbitrary base b: log_b(a) = ln(a) / ln(b).
+template <typename T1, typename T2>
+KFR_INTRINSIC flt_type<common_type<T1, T2>> logn(const T1& a, const T2& b)
+{
+    const auto numerator   = log(a);
+    const auto denominator = log(b);
+    return numerator / denominator;
+}
+
+// Scaled logarithm: ln(a) * b.
+template <typename T1, typename T2>
+KFR_INTRINSIC flt_type<common_type<T1, T2>> logm(const T1& a, const T2& b)
+{
+    const auto natural = log(a);
+    return natural * b;
+}
+
+// Fused helper: e^(x*m + a).
+template <typename T1, typename T2, typename T3>
+KFR_INTRINSIC flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& m, const T3& a)
+{
+    const auto linear = fmadd(x, m, a);
+    return exp(linear);
+}
+
+// Fused helper: ln(x)*m + a.
+template <typename T1, typename T2, typename T3>
+KFR_INTRINSIC flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& m, const T3& a)
+{
+    const auto natural = log(x);
+    return fmadd(natural, m, a);
+}
+} // namespace intrinsics
+// Function-object wrappers for expression templates.
+KFR_I_FN(exp)
+KFR_I_FN(exp2)
+KFR_I_FN(exp10)
+KFR_I_FN(log)
+KFR_I_FN(log2)
+KFR_I_FN(log10)
+KFR_I_FN(logb)
+KFR_I_FN(logn)
+KFR_I_FN(logm)
+KFR_I_FN(exp_fmadd)
+KFR_I_FN(log_fmadd)
+KFR_I_FN(pow)
+KFR_I_FN(root)
+KFR_I_FN(cbrt)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/logical.hpp b/include/kfr/math/impl/logical.hpp
@@ -0,0 +1,278 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/abs.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+#if defined CMT_ARCH_SSE41
+
+// horizontal OR: true when any bit of any lane is set
+// (_mm_testz_si128(v, v) returns 1 only for an all-zero vector)
+KFR_INTRINSIC bool bittestany(const u8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const u16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const u32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const u64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i8sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i16sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i32sse& x) { return !_mm_testz_si128(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i64sse& x) { return !_mm_testz_si128(x.v, x.v); }
+
+// horizontal AND: true when every bit of every lane is set
+// (_mm_testc_si128 checks that x "covers" the all-ones vector)
+KFR_INTRINSIC bool bittestall(const u8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const u16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const u32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const u64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i8sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i16sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i32sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i64sse& x) { return _mm_testc_si128(x.v, allonesvector(x).v); }
+#endif
+
+#if defined CMT_ARCH_AVX
+// horizontal OR: true when any bit is set (testz family; see SSE4.1 notes above)
+KFR_INTRINSIC bool bittestany(const f32sse& x) { return !_mm_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const f64sse& x) { return !_mm_testz_pd(x.v, x.v); }
+
+KFR_INTRINSIC bool bittestany(const f32avx& x) { return !_mm256_testz_ps(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const f64avx& x) { return !_mm256_testz_pd(x.v, x.v); }
+
+KFR_INTRINSIC bool bittestany(const u8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const u16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const u32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const u64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i8avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i16avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i32avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+KFR_INTRINSIC bool bittestany(const i64avx& x) { return !_mm256_testz_si256(x.v, x.v); }
+
+// horizontal AND: true when all bits are set (testc against all-ones)
+KFR_INTRINSIC bool bittestall(const f32sse& x) { return _mm_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const f64sse& x) { return _mm_testc_pd(x.v, allonesvector(x).v); }
+
+KFR_INTRINSIC bool bittestall(const f32avx& x) { return _mm256_testc_ps(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const f64avx& x) { return _mm256_testc_pd(x.v, allonesvector(x).v); }
+
+KFR_INTRINSIC bool bittestall(const u8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const u16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const u32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const u64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i8avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i16avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i32avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+KFR_INTRINSIC bool bittestall(const i64avx& x) { return _mm256_testc_si256(x.v, allonesvector(x).v); }
+
+#if defined CMT_ARCH_AVX512
+// horizontal OR: movepi*_mask gathers the top bit of each lane into a kmask;
+// nonzero kmask means some lane's sign bit is set. NOTE(review): unlike the
+// testz-based paths, this observes only the top bit per lane — valid for
+// all-ones/all-zero lane masks.
+KFR_INTRINSIC bool bittestany(const f32avx512& x) { return _mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const f64avx512& x) { return _mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestany(const u8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const u16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const u32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const u64avx512& x) { return _mm512_movepi64_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const i8avx512& x) { return _mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const i16avx512& x) { return _mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const i32avx512& x) { return _mm512_movepi32_mask(x.v); }
+KFR_INTRINSIC bool bittestany(const i64avx512& x) { return _mm512_movepi64_mask(x.v); }
+
+// horizontal AND: all kmask bits set <=> ~kmask == 0; the uint16_t/uint8_t
+// casts trim the complement to the kmask's actual width (16 or 8 lanes).
+KFR_INTRINSIC bool bittestall(const f32avx512& x) { return !~_mm512_movepi32_mask(_mm512_castps_si512(x.v)); }
+KFR_INTRINSIC bool bittestall(const f64avx512& x) { return !~_mm512_movepi64_mask(_mm512_castpd_si512(x.v)); }
+KFR_INTRINSIC bool bittestall(const u8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const u16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const u32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const u64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const i8avx512& x) { return !~_mm512_movepi8_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const i16avx512& x) { return !~_mm512_movepi16_mask(x.v); }
+KFR_INTRINSIC bool bittestall(const i32avx512& x) { return !uint16_t(~_mm512_movepi32_mask(x.v)); }
+KFR_INTRINSIC bool bittestall(const i64avx512& x) { return !uint8_t(~_mm512_movepi64_mask(x.v)); }
+
+#endif
+
+#elif defined CMT_ARCH_SSE41
+// SSE4.1 without AVX: no float testz/testc, so reinterpret the float
+// vectors as bytes and use the integer test instructions.
+KFR_INTRINSIC bool bittestany(const f32sse& x)
+{
+    return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v);
+}
+KFR_INTRINSIC bool bittestany(const f64sse& x)
+{
+    return !_mm_testz_si128(bitcast<u8>(x).v, bitcast<u8>(x).v);
+}
+KFR_INTRINSIC bool bittestall(const f32sse& x)
+{
+    return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v);
+}
+KFR_INTRINSIC bool bittestall(const f64sse& x)
+{
+    return _mm_testc_si128(bitcast<u8>(x).v, allonesvector(bitcast<u8>(x)).v);
+}
+#endif
+
+#if !defined CMT_ARCH_SSE41
+
+// Plain SSE2 fallback: movemask extracts only the top (sign) bit of each
+// lane/byte, so these assume mask vectors whose lanes are all-ones or
+// all-zero — with arbitrary bit patterns the result differs from the
+// any-bit testz semantics above. NOTE(review): confirm inputs are masks.
+KFR_INTRINSIC bool bittestany(const f32sse& x) { return _mm_movemask_ps(x.v); }
+KFR_INTRINSIC bool bittestany(const f64sse& x) { return _mm_movemask_pd(x.v); }
+KFR_INTRINSIC bool bittestany(const u8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const u16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const u32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const u64sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const i8sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const i16sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const i32sse& x) { return _mm_movemask_epi8(x.v); }
+KFR_INTRINSIC bool bittestany(const i64sse& x) { return _mm_movemask_epi8(x.v); }
+
+// All-set test: complement, then check no sign bit remains.
+KFR_INTRINSIC bool bittestall(const f32sse& x) { return !_mm_movemask_ps((~x).v); }
+KFR_INTRINSIC bool bittestall(const f64sse& x) { return !_mm_movemask_pd((~x).v); }
+KFR_INTRINSIC bool bittestall(const u8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const u16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const u32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const u64sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const i8sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const i16sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const i32sse& x) { return !_mm_movemask_epi8((~x).v); }
+KFR_INTRINSIC bool bittestall(const i64sse& x) { return !_mm_movemask_epi8((~x).v); }
+#endif
+
+// Width adapters: vectors narrower than the native register are padded
+// (with all-ones for ALL, all-zeros for ANY, so padding cannot change the
+// result); wider vectors are reduced by halves.
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+{
+    return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+{
+    return bittestall(low(a)) && bittestall(high(a));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+{
+    return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+{
+    return bittestany(low(a)) || bittestany(high(a));
+}
+
+#elif CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+// NEON reduction for u32 lanes: AND (resp. OR) the low/high halves, then
+// pairwise min (resp. max) folds the two remaining lanes into one.
+KFR_INTRINSIC bool bittestall(const u32neon& a)
+{
+    const uint32x2_t tmp = vand_u32(vget_low_u32(a.v), vget_high_u32(a.v));
+    return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
+}
+
+KFR_INTRINSIC bool bittestany(const u32neon& a)
+{
+    const uint32x2_t tmp = vorr_u32(vget_low_u32(a.v), vget_high_u32(a.v));
+    return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
+}
+// All other element types are reinterpreted as u32 lanes; bit tests are
+// insensitive to the lane partitioning for mask vectors.
+KFR_INTRINSIC bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); }
+
+KFR_INTRINSIC bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_INTRINSIC bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); }
+
+// Width adapters (same scheme as the x86 branch): pad narrow vectors with
+// neutral mask bits, split wide ones in halves.
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+{
+    return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestall(const vec<T, N>& a)
+{
+    return bittestall(low(a)) && bittestall(high(a));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T>)>
+KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+{
+    return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T>), typename = void>
+KFR_INTRINSIC bool bittestany(const vec<T, N>& a)
+{
+    return bittestany(low(a)) || bittestany(high(a));
+}
+
+#else
+
+// Portable fallback: collect the top (sign) bit of every lane into an
+// integer bitmask, then test that mask. Like the movemask paths, this
+// observes only one bit per lane, so inputs are expected to be lane masks.
+template <typename T, size_t N>
+KFR_INTRINSIC bitmask<N> getmask(const vec<T, N>& x)
+{
+    typename bitmask<N>::type val = 0;
+    for (size_t i = 0; i < N; i++)
+    {
+        val |= (ubitcast(x[i]) >> (typebits<T>::bits - 1)) << i; // lane i's top bit -> bit i
+    }
+    return val;
+}
+
+// True when any lane's mask bit is set.
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestany(const vec<T, N>& x)
+{
+    return getmask(x).value;
+}
+// True when any lane is set in both x and y.
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestany(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return bittestany(x & y);
+}
+
+// True when every lane's mask bit is set.
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestall(const vec<T, N>& x)
+{
+    return !getmask(~x).value;
+}
+// True when every lane set in y is also set in x.
+template <typename T, size_t N>
+KFR_INTRINSIC bool bittestall(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return !bittestany(~x & y);
+}
+#endif
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/min_max.hpp b/include/kfr/math/impl/min_max.hpp
@@ -0,0 +1,236 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/abs.hpp"
+#include "../../math/select.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(x.v, y.v); }
+KFR_INTRINSIC u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(x.v, y.v); }
+KFR_INTRINSIC i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(x.v, y.v); }
+
+KFR_INTRINSIC f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(x.v, y.v); }
+KFR_INTRINSIC u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(x.v, y.v); }
+KFR_INTRINSIC i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(x.v, y.v); }
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(x.v, y.v); }
+
+KFR_INTRINSIC u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(x.v, y.v); }
+
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC f32avx512 min(const f32avx512& x, const f32avx512& y) { return _mm512_min_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 min(const f64avx512& x, const f64avx512& y) { return _mm512_min_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 max(const f32avx512& x, const f32avx512& y) { return _mm512_max_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 max(const f64avx512& x, const f64avx512& y) { return _mm512_max_pd(x.v, y.v); }
+
+KFR_INTRINSIC u8avx512 min(const u8avx512& x, const u8avx512& y) { return _mm512_min_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 min(const i16avx512& x, const i16avx512& y) { return _mm512_min_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx512 min(const i8avx512& x, const i8avx512& y) { return _mm512_min_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 min(const u16avx512& x, const u16avx512& y) { return _mm512_min_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 min(const i32avx512& x, const i32avx512& y) { return _mm512_min_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx512 min(const u32avx512& x, const u32avx512& y) { return _mm512_min_epu32(x.v, y.v); }
+KFR_INTRINSIC u8avx512 max(const u8avx512& x, const u8avx512& y) { return _mm512_max_epu8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 max(const i16avx512& x, const i16avx512& y) { return _mm512_max_epi16(x.v, y.v); }
+KFR_INTRINSIC i8avx512 max(const i8avx512& x, const i8avx512& y) { return _mm512_max_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 max(const u16avx512& x, const u16avx512& y) { return _mm512_max_epu16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 max(const i32avx512& x, const i32avx512& y) { return _mm512_max_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx512 max(const u32avx512& x, const u32avx512& y) { return _mm512_max_epu32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 min(const i64avx512& x, const i64avx512& y) { return _mm512_min_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx512 min(const u64avx512& x, const u64avx512& y) { return _mm512_min_epu64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 max(const i64avx512& x, const i64avx512& y) { return _mm512_max_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx512 max(const u64avx512& x, const u64avx512& y) { return _mm512_max_epu64(x.v, y.v); }
+
+KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(x.v, y.v); }
+KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(x.v, y.v); }
+
+KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(x.v, y.v); }
+KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(x.v, y.v); }
+#else
+KFR_INTRINSIC i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
+KFR_INTRINSIC i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
+#endif
+
+#if defined CMT_ARCH_AVX
+KFR_INTRINSIC f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(x.v, y.v); }
+#endif
+
+#if defined CMT_ARCH_SSE41
+KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(x.v, y.v); }
+KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(x.v, y.v); }
+KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(x.v, y.v); }
+
+KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(x.v, y.v); }
+KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(x.v, y.v); }
+KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(x.v, y.v); }
+#else
+KFR_INTRINSIC i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); }
+
+KFR_INTRINSIC i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); }
+
+#endif
+
+KFR_HANDLE_ALL_SIZES_2(min)
+KFR_HANDLE_ALL_SIZES_2(max)
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(x.v, y.v); }
+KFR_INTRINSIC u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(x.v, y.v); }
+KFR_INTRINSIC i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(x.v, y.v); }
+KFR_INTRINSIC u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(x.v, y.v); }
+KFR_INTRINSIC i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(x.v, y.v); }
+KFR_INTRINSIC u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(x.v, y.v); }
+KFR_INTRINSIC i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); }
+KFR_INTRINSIC u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); }
+
+KFR_INTRINSIC i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(x.v, y.v); }
+KFR_INTRINSIC u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(x.v, y.v); }
+KFR_INTRINSIC i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(x.v, y.v); }
+KFR_INTRINSIC u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(x.v, y.v); }
+KFR_INTRINSIC i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(x.v, y.v); }
+KFR_INTRINSIC u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(x.v, y.v); }
+KFR_INTRINSIC i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); }
+KFR_INTRINSIC u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); }
+
+KFR_INTRINSIC f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(x.v, y.v); }
+KFR_INTRINSIC f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(x.v, y.v); }
+#if defined CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(x.v, y.v); }
+KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(x.v, y.v); }
+#else
+KFR_INTRINSIC f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); }
+KFR_INTRINSIC f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); }
+#endif
+
+KFR_HANDLE_ALL_SIZES_2(min)
+KFR_HANDLE_ALL_SIZES_2(max)
+
+#else
+
+// fallback
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return select(x < y, x, y);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return select(x > y, x, y);
+}
+#endif
+
+template <typename T>
+KFR_INTRINSIC T min(initialvalue<T>)
+{
+ return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
+ : std::numeric_limits<T>::max();
+}
+template <typename T>
+KFR_INTRINSIC T max(initialvalue<T>)
+{
+ return std::numeric_limits<T>::has_infinity ? -std::numeric_limits<T>::infinity()
+ : std::numeric_limits<T>::min();
+}
+template <typename T>
+KFR_INTRINSIC T absmin(initialvalue<T>)
+{
+ return std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
+ : std::numeric_limits<T>::max();
+}
+template <typename T>
+KFR_INTRINSIC T absmax(initialvalue<T>)
+{
+ return 0;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return min(abs(x), abs(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return max(abs(x), abs(y));
+}
+
+KFR_HANDLE_SCALAR(min)
+KFR_HANDLE_SCALAR(max)
+KFR_HANDLE_SCALAR(absmin)
+KFR_HANDLE_SCALAR(absmax)
+} // namespace intrinsics
+KFR_I_FN(min)
+KFR_I_FN(max)
+KFR_I_FN(absmin)
+KFR_I_FN(absmax)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/modzerobessel.hpp b/include/kfr/math/impl/modzerobessel.hpp
@@ -0,0 +1,104 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/log_exp.hpp"
+#include "../../simd/impl/function.hpp"
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+#if CMT_HAS_WARNING("-Wc99-extensions")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
+#endif
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> modzerobessel(const vec<T, N>& x)
+{
+ constexpr static T bessel_coef[] = { T(0.25),
+ T(0.027777777777777776236),
+ T(0.0017361111111111110147),
+ T(6.9444444444444444384e-005),
+ T(1.9290123456790123911e-006),
+ T(3.9367598891408417495e-008),
+ T(6.1511873267825652335e-010),
+ T(7.5940584281266239246e-012),
+ T(7.5940584281266233693e-014),
+ T(6.2760813455591932909e-016),
+ T(4.3583898233049949985e-018),
+ T(2.5789288895295827557e-020),
+ T(1.3157800456783586208e-022),
+ T(5.8479113141260384983e-025),
+ T(2.2843403570804837884e-027),
+ T(7.904291893012054025e-030),
+ T(2.4395962632753252792e-032),
+ T(6.75788438580422547e-035),
+ T(1.689471096451056426e-037),
+ T(3.8310002187098784929e-040),
+ T(7.9152897080782616517e-043),
+ T(1.4962740468957016443e-045),
+ T(2.5976979980828152196e-048),
+ T(4.1563167969325041577e-051),
+ T(6.1483976285983795968e-054),
+ T(8.434015951438105991e-057),
+ T(1.0757673407446563809e-059),
+ T(1.2791526049282476926e-062),
+ T(1.4212806721424974034e-065),
+ T(1.4789601166935457918e-068),
+ T(1.4442969889585408123e-071),
+ T(1.3262598613026086927e-074),
+ T(1.1472836170437790782e-077),
+ T(9.3655805472961564331e-081),
+ T(7.2265282000741942594e-084),
+ T(5.2786911614858977913e-087),
+ T(3.6556032974279072401e-090),
+ T(2.4034209713529963119e-093),
+ T(1.5021381070956226783e-096) };
+
+ const vec<T, N> x_2 = x * 0.5;
+ const vec<T, N> x_2_sqr = x_2 * x_2;
+ vec<T, N> num = x_2_sqr;
+ vec<T, N> result;
+ result = 1 + x_2_sqr;
+
+ CMT_LOOP_UNROLL
+ for (size_t i = 0; i < (sizeof(T) == 4 ? 20 : 39); i++)
+ {
+ result = fmadd((num *= x_2_sqr), bessel_coef[i], result);
+ }
+ return result;
+}
+
+KFR_HANDLE_SCALAR(modzerobessel)
+} // namespace intrinsics
+KFR_I_FN(modzerobessel)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/math/impl/round.hpp b/include/kfr/math/impl/round.hpp
@@ -0,0 +1,282 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+#include "abs.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#define KFR_mm_trunc_ps(V) _mm_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_ps(V) _mm_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#define KFR_mm_trunc_pd(V) _mm_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_pd(V) _mm_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+
+#define KFR_mm_trunc_ss(V) _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_ss(V) \
+ _mm_round_ss(_mm_setzero_ps(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#define KFR_mm_trunc_sd(V) _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm_roundnearest_sd(V) \
+ _mm_round_sd(_mm_setzero_pd(), (V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+
+#define KFR_mm_floor_ss(V) _mm_floor_ss(_mm_setzero_ps(), (V))
+#define KFR_mm_floor_sd(V) _mm_floor_sd(_mm_setzero_pd(), (V))
+#define KFR_mm_ceil_ss(V) _mm_ceil_ss(_mm_setzero_ps(), (V))
+#define KFR_mm_ceil_sd(V) _mm_ceil_sd(_mm_setzero_pd(), (V))
+
+#define KFR_mm256_trunc_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm256_roundnearest_ps(V) _mm256_round_ps((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#define KFR_mm256_trunc_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
+#define KFR_mm256_roundnearest_pd(V) _mm256_round_pd((V), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC f32sse floor(const f32sse& value) { return _mm_floor_ps(value.v); }
+KFR_INTRINSIC f32sse ceil(const f32sse& value) { return _mm_ceil_ps(value.v); }
+KFR_INTRINSIC f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(value.v); }
+KFR_INTRINSIC f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(value.v); }
+KFR_INTRINSIC f64sse floor(const f64sse& value) { return _mm_floor_pd(value.v); }
+KFR_INTRINSIC f64sse ceil(const f64sse& value) { return _mm_ceil_pd(value.v); }
+KFR_INTRINSIC f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(value.v); }
+KFR_INTRINSIC f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(value.v); }
+KFR_INTRINSIC f32sse fract(const f32sse& x) { return x - floor(x); }
+KFR_INTRINSIC f64sse fract(const f64sse& x) { return x - floor(x); }
+
+#if defined CMT_ARCH_AVX
+
+KFR_INTRINSIC f32avx floor(const f32avx& value) { return _mm256_floor_ps(value.v); }
+KFR_INTRINSIC f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(value.v); }
+KFR_INTRINSIC f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(value.v); }
+KFR_INTRINSIC f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(value.v); }
+KFR_INTRINSIC f64avx floor(const f64avx& value) { return _mm256_floor_pd(value.v); }
+KFR_INTRINSIC f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(value.v); }
+KFR_INTRINSIC f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(value.v); }
+KFR_INTRINSIC f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(value.v); }
+KFR_INTRINSIC f32avx fract(const f32avx& x) { return x - floor(x); }
+KFR_INTRINSIC f64avx fract(const f64avx& x) { return x - floor(x); }
+
+#endif
+
+#if defined CMT_ARCH_AVX512
+
+KFR_INTRINSIC f32avx512 floor(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 ceil(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 trunc(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 round(const f32avx512& value)
+{
+ return _mm512_roundscale_ps(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 floor(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 ceil(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 trunc(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f64avx512 round(const f64avx512& value)
+{
+ return _mm512_roundscale_pd(value.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+KFR_INTRINSIC f32avx512 fract(const f32avx512& x) { return x - floor(x); }
+KFR_INTRINSIC f64avx512 fract(const f64avx512& x) { return x - floor(x); }
+#endif
+
+KFR_HANDLE_ALL_SIZES_1_IF(floor, is_f_class<T>::value)
+KFR_HANDLE_ALL_SIZES_1_IF(ceil, is_f_class<T>::value)
+KFR_HANDLE_ALL_SIZES_1_IF(round, is_f_class<T>::value)
+KFR_HANDLE_ALL_SIZES_1_IF(trunc, is_f_class<T>::value)
+KFR_HANDLE_ALL_SIZES_1_IF(fract, is_f_class<T>::value)
+
+#else
+
+// fallback
+
+template <typename T>
+constexpr T fp_precision_limit = 4503599627370496.0;
+template <>
+constexpr f32 fp_precision_limit<f32> = 16777216.0f;
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> floor(const vec<f32, N>& x)
+{
+ vec<f32, N> t = innercast<f32>(innercast<i32>(x));
+ return select(abs(x) >= fp_precision_limit<f32>, x, t - select(x < t, 1.f, 0.f));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> floor(const vec<f64, N>& x)
+{
+ vec<f64, N> t = innercast<f64>(innercast<i64>(x));
+ return select(abs(x) >= fp_precision_limit<f64>, x, t - select(x < t, 1., 0.));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> ceil(const vec<f32, N>& x)
+{
+ vec<f32, N> t = innercast<f32>(innercast<i32>(x));
+ return select(abs(x) >= fp_precision_limit<f32>, x, t + select(x > t, 1.f, 0.f));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> ceil(const vec<f64, N>& x)
+{
+ vec<f64, N> t = innercast<f64>(innercast<i64>(x));
+ return select(abs(x) >= fp_precision_limit<f64>, x, t + select(x > t, 1., 0.));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> round(const vec<f32, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f32>, x,
+ innercast<f32>(innercast<i32>(x + mulsign(broadcast<N>(0.5f), x))));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> round(const vec<f64, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f64>, x,
+ innercast<f64>(innercast<i64>(x + mulsign(broadcast<N>(0.5), x))));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> trunc(const vec<f32, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f32>, x, innercast<f32>(innercast<i32>(x)));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> trunc(const vec<f64, N>& x)
+{
+ return select(abs(x) >= fp_precision_limit<f64>, x, innercast<f64>(innercast<i64>(x)));
+}
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> fract(const vec<f32, N>& x)
+{
+ return x - floor(x);
+}
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> fract(const vec<f64, N>& x)
+{
+ return x - floor(x);
+}
+#endif
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> floor(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> ceil(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> trunc(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> round(const vec<T, N>& value)
+{
+ return value;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> fract(const vec<T, N>&)
+{
+ return T(0);
+}
+
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> ifloor(const vec<T, N>& value)
+{
+ return innercast<IT>(floor(value));
+}
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> iceil(const vec<T, N>& value)
+{
+ return innercast<IT>(ceil(value));
+}
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> itrunc(const vec<T, N>& value)
+{
+ return innercast<IT>(trunc(value));
+}
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<IT, N> iround(const vec<T, N>& value)
+{
+ return innercast<IT>(round(value));
+}
+
+KFR_HANDLE_SCALAR(floor)
+KFR_HANDLE_SCALAR(ceil)
+KFR_HANDLE_SCALAR(round)
+KFR_HANDLE_SCALAR(trunc)
+KFR_HANDLE_SCALAR(fract)
+KFR_HANDLE_SCALAR(ifloor)
+KFR_HANDLE_SCALAR(iceil)
+KFR_HANDLE_SCALAR(iround)
+KFR_HANDLE_SCALAR(itrunc)
+} // namespace intrinsics
+KFR_I_FN(floor)
+KFR_I_FN(ceil)
+KFR_I_FN(round)
+KFR_I_FN(trunc)
+KFR_I_FN(fract)
+KFR_I_FN(ifloor)
+KFR_I_FN(iceil)
+KFR_I_FN(iround)
+KFR_I_FN(itrunc)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+#undef KFR_mm_trunc_ps
+#undef KFR_mm_roundnearest_ps
+#undef KFR_mm_trunc_pd
+#undef KFR_mm_roundnearest_pd
+#undef KFR_mm_trunc_ss
+#undef KFR_mm_roundnearest_ss
+#undef KFR_mm_trunc_sd
+#undef KFR_mm_roundnearest_sd
+#undef KFR_mm_floor_ss
+#undef KFR_mm_floor_sd
+#undef KFR_mm_ceil_ss
+#undef KFR_mm_ceil_sd
+#undef KFR_mm256_trunc_ps
+#undef KFR_mm256_roundnearest_ps
+#undef KFR_mm256_trunc_pd
+#undef KFR_mm256_roundnearest_pd
diff --git a/include/kfr/math/impl/saturation.hpp b/include/kfr/math/impl/saturation.hpp
@@ -0,0 +1,205 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/select.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Generic functions
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+ using UT = utype<T>;
+ constexpr size_t shift = typebits<UT>::bits - 1;
+ vec<UT, N> aa = bitcast<UT>(a);
+ vec<UT, N> bb = bitcast<UT>(b);
+ const vec<UT, N> sum = aa + bb;
+ aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+ return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= T(), a, bitcast<T>(sum));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+ using UT = utype<T>;
+ constexpr size_t shift = typebits<UT>::bits - 1;
+ vec<UT, N> aa = bitcast<UT>(a);
+ vec<UT, N> bb = bitcast<UT>(b);
+ const vec<UT, N> diff = aa - bb;
+ aa = (aa >> shift) + static_cast<UT>(std::numeric_limits<T>::max());
+
+ return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < T(), a, bitcast<T>(diff));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
+{
+ const vec<T, N> t = allonesvector(a);
+ return select(a > t - b, t, a + b);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
+{
+ return select(a < b, zerovector(a), a - b);
+}
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(x.v, y.v); }
+
+KFR_INTRINSIC u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
+KFR_INTRINSIC u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
+
+KFR_INTRINSIC i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
+KFR_INTRINSIC u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
+
+#if defined CMT_ARCH_AVX2
+// AVX2: native saturating add/sub exists for 8/16-bit lanes (vpadds*/vpsubs*);
+// 32/64-bit lanes still have no hardware instruction and reuse the generic helpers.
+KFR_INTRINSIC u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(x.v, y.v); }
+
+KFR_INTRINSIC u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(x.v, y.v); }
+
+// Wider lanes: software saturation, same as the SSE path above.
+KFR_INTRINSIC i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); }
+KFR_INTRINSIC u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); }
+
+KFR_INTRINSIC i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); }
+KFR_INTRINSIC u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); }
+#endif
+
+#if defined CMT_ARCH_AVX512
+// AVX-512BW: 8/16-bit saturating add/sub on 512-bit registers; 32/64-bit lanes
+// again fall back to the generic software-saturation helpers.
+KFR_INTRINSIC u8avx512 satadd(const u8avx512& x, const u8avx512& y) { return _mm512_adds_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx512 satadd(const i8avx512& x, const i8avx512& y) { return _mm512_adds_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 satadd(const u16avx512& x, const u16avx512& y) { return _mm512_adds_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx512 satadd(const i16avx512& x, const i16avx512& y) { return _mm512_adds_epi16(x.v, y.v); }
+KFR_INTRINSIC u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm512_subs_epu8(x.v, y.v); }
+KFR_INTRINSIC i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(x.v, y.v); }
+KFR_INTRINSIC i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(x.v, y.v); }
+
+KFR_INTRINSIC i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); }
+KFR_INTRINSIC u32avx512 satadd(const u32avx512& a, const u32avx512& b)
+{
+    return saturated_unsigned_add(a, b);
+}
+KFR_INTRINSIC u64avx512 satadd(const u64avx512& a, const u64avx512& b)
+{
+    return saturated_unsigned_add(a, b);
+}
+KFR_INTRINSIC i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); }
+KFR_INTRINSIC u32avx512 satsub(const u32avx512& a, const u32avx512& b)
+{
+    return saturated_unsigned_sub(a, b);
+}
+KFR_INTRINSIC u64avx512 satsub(const u64avx512& a, const u64avx512& b)
+{
+    return saturated_unsigned_sub(a, b);
+}
+#endif
+
+// Generate the remaining vector-width overloads (presumably by widening/splitting
+// to the natively supported widths above — macro defined in simd/impl/function.hpp;
+// TODO confirm).
+KFR_HANDLE_ALL_SIZES_2(satadd)
+KFR_HANDLE_ALL_SIZES_2(satsub)
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+// NEON provides hardware saturating add/sub (vqadd/vqsub) for every integer
+// width, including 32 and 64 bits, so no software fallback is needed here.
+KFR_INTRINSIC u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(x.v, y.v); }
+KFR_INTRINSIC i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(x.v, y.v); }
+KFR_INTRINSIC u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(x.v, y.v); }
+KFR_INTRINSIC i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(x.v, y.v); }
+KFR_INTRINSIC u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(a.v, b.v); }
+KFR_INTRINSIC i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(a.v, b.v); }
+KFR_INTRINSIC u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(a.v, b.v); }
+KFR_INTRINSIC i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(a.v, b.v); }
+
+KFR_INTRINSIC u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(x.v, y.v); }
+KFR_INTRINSIC i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(x.v, y.v); }
+KFR_INTRINSIC u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(x.v, y.v); }
+KFR_INTRINSIC i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(x.v, y.v); }
+KFR_INTRINSIC u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(a.v, b.v); }
+KFR_INTRINSIC i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(a.v, b.v); }
+KFR_INTRINSIC u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(a.v, b.v); }
+KFR_INTRINSIC i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(a.v, b.v); }
+
+KFR_HANDLE_ALL_SIZES_2(satadd)
+KFR_HANDLE_ALL_SIZES_2(satsub)
+
+#else
+// fallback
+// Portable path when no native intrinsics are available: dispatch on signedness
+// via SFINAE to the generic saturation helpers.
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
+KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
+{
+    return saturated_signed_add(a, b);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
+KFR_INTRINSIC vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
+{
+    return saturated_unsigned_add(a, b);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
+KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
+{
+    return saturated_signed_sub(a, b);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
+KFR_INTRINSIC vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
+{
+    return saturated_unsigned_sub(a, b);
+}
+#endif
+// Scalar (N == 1 / plain T) entry points forwarding to the vector overloads.
+KFR_HANDLE_SCALAR(satadd)
+KFR_HANDLE_SCALAR(satsub)
+} // namespace intrinsics
+KFR_I_FN(satadd)
+KFR_I_FN(satsub)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/select.hpp b/include/kfr/math/impl/select.hpp
@@ -0,0 +1,329 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+
+// select(m, x, y): per-lane blend returning x where the mask is set, y elsewhere.
+// Note the argument order: _mm_blendv_* takes (a, b, mask) and picks b where the
+// mask's high bit is set, hence (y, x, m) here. For the integer overloads
+// _mm_blendv_epi8 blends per byte, which is correct because KFR masks are
+// all-ones/all-zeros per lane.
+KFR_INTRINSIC u8sse select(const u8sse& m, const u8sse& x, const u8sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16sse select(const u16sse& m, const u16sse& x, const u16sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32sse select(const u32sse& m, const u32sse& x, const u32sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64sse select(const u64sse& m, const u64sse& x, const u64sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8sse select(const i8sse& m, const i8sse& x, const i8sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16sse select(const i16sse& m, const i16sse& x, const i16sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32sse select(const i32sse& m, const i32sse& x, const i32sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64sse select(const i64sse& m, const i64sse& x, const i64sse& y)
+{
+    return _mm_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32sse select(const f32sse& m, const f32sse& x, const f32sse& y)
+{
+    return _mm_blendv_ps(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f64sse select(const f64sse& m, const f64sse& x, const f64sse& y)
+{
+    return _mm_blendv_pd(y.v, x.v, m.v);
+}
+
+#if defined CMT_ARCH_AVX
+// AVX1 has 256-bit blendv only for float/double; integer overloads need AVX2 (below).
+KFR_INTRINSIC f64avx select(const f64avx& m, const f64avx& x, const f64avx& y)
+{
+    return _mm256_blendv_pd(y.v, x.v, m.v);
+}
+KFR_INTRINSIC f32avx select(const f32avx& m, const f32avx& x, const f32avx& y)
+{
+    return _mm256_blendv_ps(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX2
+// 256-bit integer blends; same (y, x, m) argument swap as the SSE4.1 overloads.
+KFR_INTRINSIC u8avx select(const u8avx& m, const u8avx& x, const u8avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u16avx select(const u16avx& m, const u16avx& x, const u16avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u32avx select(const u32avx& m, const u32avx& x, const u32avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC u64avx select(const u64avx& m, const u64avx& x, const u64avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i8avx select(const i8avx& m, const i8avx& x, const i8avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i16avx select(const i16avx& m, const i16avx& x, const i16avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i32avx select(const i32avx& m, const i32avx& x, const i32avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+KFR_INTRINSIC i64avx select(const i64avx& m, const i64avx& x, const i64avx& y)
+{
+    return _mm256_blendv_epi8(y.v, x.v, m.v);
+}
+#endif
+
+#if defined CMT_ARCH_AVX512
+// AVX-512 has no vector-mask blendv; the vector mask is first compressed to a
+// k-register by _mm512_movepi*_mask (takes each lane's sign bit), then
+// _mm512_mask_blend_* picks x where the k-bit is set, y elsewhere.
+KFR_INTRINSIC f64avx512 select(const f64avx512& m, const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_mask_blend_pd(_mm512_movepi64_mask(_mm512_castpd_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC f32avx512 select(const f32avx512& m, const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_mask_blend_ps(_mm512_movepi32_mask(_mm512_castps_si512(m.v)), y.v, x.v);
+}
+KFR_INTRINSIC u8avx512 select(const u8avx512& m, const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u16avx512 select(const u16avx512& m, const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u32avx512 select(const u32avx512& m, const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC u64avx512 select(const u64avx512& m, const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i8avx512 select(const i8avx512& m, const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_mask_blend_epi8(_mm512_movepi8_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i16avx512 select(const i16avx512& m, const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_mask_blend_epi16(_mm512_movepi16_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i32avx512 select(const i32avx512& m, const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_mask_blend_epi32(_mm512_movepi32_mask(m.v), y.v, x.v);
+}
+KFR_INTRINSIC i64avx512 select(const i64avx512& m, const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_mask_blend_epi64(_mm512_movepi64_mask(m.v), y.v, x.v);
+}
+#endif
+
+// Width dispatchers: vectors narrower than the native SIMD width are widened
+// (shuffle to next_simd_width), selected, and narrowed back; wider vectors are
+// processed piecewise. Overloads taking scalar b/c broadcast them first.
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    vec<T, N> r;
+    // intrin() applies the lambda chunk-wise over the oversized vectors
+    // (older halving implementations kept below for reference).
+    intrin(r, a, b, c, [](auto x, auto y, auto z) { return intrinsics::select(x, y, z); });
+    return r;
+    // return concat(select(low(a), low(b), low(c)), select(high(a), high(b), high(c)));
+    // return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), vec<T, Nout>(b), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const T& c)
+{
+    return concat2(select(a.h.low, b, c), select(a.h.high, b, c))!;
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), vec<T, Nout>(c)).shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const T& c)
+{
+    return concat2(select(a.h.low, b.h.low, c), select(a.h.high, b.h.high, c));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    // NOTE(review): this overload uses free-function shufflevector(a, ...) while the
+    // siblings use a.shuffle(...) — presumably equivalent; confirm intentional.
+    return select(shufflevector(a, csizeseq<Nout>), vec<T, Nout>(b), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const T& b, const vec<T, N>& c)
+{
+    return concat2(select(a.h.low, b, c.h.low), select(a.h.high, b, c.h.high));
+}
+
+#elif defined CMT_ARCH_NEON && defined KFR_NATIVE_INTRINSICS
+
+// NEON bit-select: vbslq picks bits from x where the mask bit is 1, from y
+// where it is 0 — natural (m, x, y) argument order, unlike the x86 blendv path.
+KFR_INTRINSIC f32neon select(const f32neon& m, const f32neon& x, const f32neon& y)
+{
+    return vbslq_f32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i8neon select(const i8neon& m, const i8neon& x, const i8neon& y)
+{
+    return vbslq_s8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u8neon select(const u8neon& m, const u8neon& x, const u8neon& y)
+{
+    return vbslq_u8(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i16neon select(const i16neon& m, const i16neon& x, const i16neon& y)
+{
+    return vbslq_s16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u16neon select(const u16neon& m, const u16neon& x, const u16neon& y)
+{
+    return vbslq_u16(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i32neon select(const i32neon& m, const i32neon& x, const i32neon& y)
+{
+    return vbslq_s32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u32neon select(const u32neon& m, const u32neon& x, const u32neon& y)
+{
+    return vbslq_u32(m.v, x.v, y.v);
+}
+KFR_INTRINSIC i64neon select(const i64neon& m, const i64neon& x, const i64neon& y)
+{
+    return vbslq_s64(m.v, x.v, y.v);
+}
+KFR_INTRINSIC u64neon select(const u64neon& m, const u64neon& x, const u64neon& y)
+{
+    return vbslq_u64(m.v, x.v, y.v);
+}
+
+#ifdef CMT_ARCH_NEON64
+KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y)
+{
+    return vbslq_f64(m.v, x.v, y.v);
+}
+#else
+// 32-bit NEON has no f64 vector ops: emulate the blend with the classic
+// bitwise identity (m all-ones per selected lane).
+KFR_INTRINSIC f64neon select(const f64neon& m, const f64neon& x, const f64neon& y)
+{
+    return y ^ ((x ^ y) & m);
+}
+#endif
+
+// Width dispatchers for the NEON path: widen-select-narrow for sub-native sizes,
+// recurse on halves for oversized vectors; scalar arguments are broadcast.
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    constexpr size_t Nout = next_simd_width<T>(N);
+    return select(a.shuffle(csizeseq<Nout>), b.shuffle(csizeseq<Nout>), c.shuffle(csizeseq<Nout>))
+        .shuffle(csizeseq<N>);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T>), typename = void>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
+{
+    return concat2(select(a.h.low, b.h.low, c.h.low), select(a.h.high, b.h.high, c.h.high));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y)
+{
+    return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y)
+{
+    return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y)
+{
+    return select(m, vec<T, N>(x), y);
+}
+
+#else
+
+// fallback
+// Portable bitwise blend (masks are all-ones/all-zeros per lane), plus
+// scalar-broadcasting convenience overloads.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+{
+    return y ^ ((x ^ y) & m);
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const T& y)
+{
+    return select(m, vec<T, N>(x), vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const vec<T, N>& x, const T& y)
+{
+    return select(m, x, vec<T, N>(y));
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> select(const vec<T, N>& m, const T& x, const vec<T, N>& y)
+{
+    return select(m, vec<T, N>(x), y);
+}
+#endif
+
+} // namespace intrinsics
+KFR_I_FN(select)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/sin_cos.hpp b/include/kfr/math/impl/sin_cos.hpp
@@ -0,0 +1,310 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/abs.hpp"
+#include "../../math/min_max.hpp"
+#include "../../math/round.hpp"
+#include "../../math/select.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+#include "../../simd/shuffle.hpp"
+
+#if CMT_HAS_WARNING("-Wc99-extensions")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
+#endif
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Horner evaluation of a polynomial whose coefficients are chosen per lane:
+// a-coefficients where msk is set, b-coefficients elsewhere. Used below to
+// evaluate the cos and sin series simultaneously with one polynomial pass.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0)
+{
+    return select(msk, a0, b0);
+}
+
+// Recursive step: result = horner(rest) * x + select(msk, a0, b0).
+template <typename T, size_t N, typename... Ts>
+KFR_INTRINSIC vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0,
+                                    const T& a1, const T& b1, const Ts&... values)
+{
+    return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0));
+}
+
+// Trigonometric range reduction: subtracts a multiple of the fold constant
+// (split into hi + rem1 + rem2 parts for extra precision, Cody&Waite style)
+// from |x| and reports the quadrant (0..7, rounded to even — odd quadrant
+// indices are shifted up by one together with the multiple).
+// quadrant is computed mod 16 first to keep the int conversion in range.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
+{
+    const vec<T, N> xabs = abs(x);
+    constexpr T div = constants<T>::fold_constant_div;
+    vec<T, N> y = floor(xabs / div);
+    quadrant = innercast<itype<T>>(innercast<int>(y - floor(y * T(1.0 / 16.0)) * T(16.0)));
+
+    const mask<T, N> msk = (quadrant & 1) != 0;
+    quadrant = kfr::select(msk, quadrant + 1, quadrant);
+    y = select(msk, y + T(1.0), y);
+    quadrant = quadrant & 7;
+
+    constexpr T hi = constants<T>::fold_constant_hi;
+    constexpr T rem1 = constants<T>::fold_constant_rem1;
+    constexpr T rem2 = constants<T>::fold_constant_rem2;
+    return (xabs - y * hi) - y * rem1 - y * rem2;
+}
+
+// Range reduction only, discarding the quadrant.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> fold_range(const vec<T, N>& x)
+{
+    vec<itype<T>, N> q;
+    return trig_fold(x, q);
+}
+
+// Single-precision polynomial core: evaluates the cos series (even powers) for
+// lanes where cosmask is set and the sin series for the rest, over the folded
+// (range-reduced) argument. CMT_FP pairs a hex-float literal with its decimal
+// fallback for compilers lacking hex-float support.
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask)
+{
+    constexpr f32 sin_c2 = CMT_FP(-0x2.aaaaacp-4f, -1.6666667163e-01f);
+    constexpr f32 sin_c4 = CMT_FP(0x2.222334p-8f, 8.3333970979e-03f);
+    constexpr f32 sin_c6 = CMT_FP(-0xd.0566ep-16f, -1.9868623349e-04f);
+    constexpr f32 sin_c8 = CMT_FP(0x3.64cc1cp-20f, 3.2365221614e-06f);
+    constexpr f32 sin_c10 = CMT_FP(-0x5.6c4a4p-24f, -3.2323646337e-07f);
+    constexpr f32 cos_c2 = CMT_FP(-0x8.p-4f, -5.0000000000e-01f);
+    constexpr f32 cos_c4 = CMT_FP(0xa.aaaabp-8f, 4.1666667908e-02f);
+    constexpr f32 cos_c6 = CMT_FP(-0x5.b05d48p-12f, -1.3888973044e-03f);
+    constexpr f32 cos_c8 = CMT_FP(0x1.a065f8p-16f, 2.4819273676e-05f);
+    constexpr f32 cos_c10 = CMT_FP(-0x4.cd156p-24f, -2.8616830150e-07f);
+
+    const vec<f32, N> x2 = folded * folded;
+
+    vec<f32, N> formula = trig_horner(x2, cosmask, 1.0f, 1.0f, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6,
+                                      cos_c8, sin_c8, cos_c10, sin_c10);
+
+    // sin is odd: multiply the even-power polynomial by the argument for sin lanes.
+    formula = select(cosmask, formula, formula * folded);
+    return formula;
+}
+
+// Double-precision counterpart of trig_sincos: longer polynomial (up to x^16)
+// for double accuracy; otherwise identical structure to the f32 version.
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask)
+{
+    constexpr f64 sin_c2 = CMT_FP(-0x2.aaaaaaaaaaaaap-4, -1.666666666666666574e-01);
+    constexpr f64 sin_c4 = CMT_FP(0x2.22222222220cep-8, 8.333333333333038315e-03);
+    constexpr f64 sin_c6 = CMT_FP(-0xd.00d00cffd6618p-16, -1.984126984092335463e-04);
+    constexpr f64 sin_c8 = CMT_FP(0x2.e3bc744fb879ep-20, 2.755731902164406591e-06);
+    constexpr f64 sin_c10 = CMT_FP(-0x6.b99034c1467a4p-28, -2.505204327429436704e-08);
+    constexpr f64 sin_c12 = CMT_FP(0xb.0711ea8fe8ee8p-36, 1.604729496525771112e-10);
+    constexpr f64 sin_c14 = CMT_FP(-0xb.7e010897e55dp-44, -6.532561241665605726e-13);
+    constexpr f64 sin_c16 = CMT_FP(-0xb.64eac07f1d6bp-48, -4.048035517573349688e-14);
+    constexpr f64 cos_c2 = CMT_FP(-0x8.p-4, -5.000000000000000000e-01);
+    constexpr f64 cos_c4 = CMT_FP(0xa.aaaaaaaaaaaa8p-8, 4.166666666666666435e-02);
+    constexpr f64 cos_c6 = CMT_FP(-0x5.b05b05b05ad28p-12, -1.388888888888844490e-03);
+    constexpr f64 cos_c8 = CMT_FP(0x1.a01a01a0022e6p-16, 2.480158730125666056e-05);
+    constexpr f64 cos_c10 = CMT_FP(-0x4.9f93ed845de2cp-24, -2.755731909937878141e-07);
+    constexpr f64 cos_c12 = CMT_FP(0x8.f76bc015abe48p-32, 2.087673146642573010e-09);
+    constexpr f64 cos_c14 = CMT_FP(-0xc.9bf2dbe00379p-40, -1.146797738558921387e-11);
+    constexpr f64 cos_c16 = CMT_FP(0xd.1232ac32f7258p-48, 4.643782497495272199e-14);
+
+    vec<f64, N> x2 = folded * folded;
+    vec<f64, N> formula =
+        trig_horner(x2, cosmask, 1.0, 1.0, cos_c2, sin_c2, cos_c4, sin_c4, cos_c6, sin_c6, cos_c8, sin_c8,
+                    cos_c10, sin_c10, cos_c12, sin_c12, cos_c14, sin_c14, cos_c16, sin_c16);
+
+    // sin is odd: multiply by the argument for the sin lanes.
+    formula = select(cosmask, formula, formula * folded);
+    return formula;
+}
+
+// Mixed per-lane sin/cos: lanes with cosmask set get cos(x_full), the rest get
+// sin(x_full). Sign corrections per quadrant differ for sin and cos (cos is
+// even, sin is odd), hence the two masked sign computations.
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
+KFR_INTRINSIC vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
+{
+    vec<itype<T>, N> quadrant;
+    vec<T, N> folded = trig_fold(x_full, quadrant);
+
+    // cos lanes flip sign in quadrants 2 and 4; sin lanes in quadrants >= 4.
+    mask<T, N> flip_sign =
+        kfr::select(cosmask, ((quadrant == 2) || (quadrant == 4)).asvec(), (quadrant >= 4).asvec()).asmask();
+
+    mask<T, N> usecos = (quadrant == 2) || (quadrant == 6);
+    usecos = usecos ^ cosmask; // cos lanes use the complementary series
+
+    vec<T, N> formula = trig_sincos(folded, usecos);
+
+    mask<T, N> negmask = x_full < T(0);
+
+    // sin(-x) == -sin(x), cos(-x) == cos(x): extra flip only for sin lanes.
+    flip_sign = flip_sign ^ (negmask & ~cosmask);
+
+    formula = select(flip_sign, -formula, formula);
+    return formula;
+}
+
+// Vector sin: range-reduce, pick sin or cos series by quadrant, then restore
+// the sign (quadrants 4..7 are the negative half; odd input sign also flips —
+// mask<T, N>(x) presumably reads the sign bit of x, TODO confirm).
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> sin(const vec<T, N>& x)
+{
+    vec<itype<T>, N> quadrant;
+    vec<T, N> folded = trig_fold(x, quadrant);
+
+    mask<T, N> flip_sign = quadrant >= itype<T>(4);
+    mask<T, N> usecos = (quadrant == itype<T>(2)) || (quadrant == itype<T>(6));
+
+    vec<T, N> formula = trig_sincos(folded, usecos);
+
+    formula = select(flip_sign ^ mask<T, N>(x), -formula, formula);
+    return formula;
+}
+
+// Vector cos: same structure shifted by a quadrant — negative in quadrants
+// 2..5 (here detected as q==2 or q==4 after the even-quadrant rounding),
+// cos series in quadrants 0 and 4. Sign of x is irrelevant (cos is even).
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> cos(const vec<T, N>& x)
+{
+    vec<itype<T>, N> quadrant;
+    vec<T, N> folded = trig_fold(x, quadrant);
+
+    mask<T, N> eq4 = (quadrant == 4);
+    mask<T, N> flip_sign = (quadrant == 2) || eq4;
+    mask<T, N> usecos = (quadrant == 0) || eq4;
+
+    vec<T, N> formula = trig_sincos(folded, usecos);
+
+    formula = select(flip_sign, -formula, formula);
+    return formula;
+}
+
+// Fast, lower-accuracy sin using a short odd polynomial (c2/c4/c6). The input
+// is shifted by pi and mirrored about pi/2; the `y ^ (msk & ~xx)` step copies
+// the (inverted) sign bit of xx onto y via the high-bit mask. Presumably valid
+// only over a limited input range (no full range reduction) — TODO confirm.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> fastsin(const vec<T, N>& x)
+{
+    const vec<T, N> msk = broadcast<N>(special_constants<T>::highbitmask());
+
+    constexpr static T c2 = -0.16665853559970855712890625;
+    constexpr static T c4 = +8.31427983939647674560546875e-3;
+    constexpr static T c6 = -1.85423981747590005397796630859375e-4;
+
+    const vec<T, N> pi = c_pi<T>;
+
+    vec<T, N> xx = x - pi;
+    vec<T, N> y = abs(xx);
+    y = select(y > c_pi<T, 1, 2>, pi - y, y);
+    y = y ^ (msk & ~xx);
+
+    vec<T, N> y2 = y * y;
+    vec<T, N> formula = c6;
+    vec<T, N> y3 = y2 * y;
+    formula = fmadd(formula, y2, c4);
+    formula = fmadd(formula, y2, c2);
+    formula = formula * y3 + y;
+    return formula;
+}
+
+// Fast cos via the identity cos(x) = sin(x + pi/2), wrapping back into [0, 2*pi).
+// NOTE(review): `x` is a const reference but is assigned below (`x += ...`) —
+// this is ill-formed C++; presumably the parameter was meant to be taken by
+// value or copied into a local first. Verify against the upstream source.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> fastcos(const vec<T, N>& x)
+{
+    x += c_pi<T, 1, 2>;
+    x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
+    return fastsin(x);
+}
+
+// Interleaved sin/cos: odd lanes get cos, even lanes get sin (oddmask selects
+// the cos lanes), computed in a single sincos_mask pass.
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> sincos(const vec<T, N>& x)
+{
+    return sincos_mask(x, internal::oddmask<T, N>());
+}
+
+// Same as sincos with the lane roles swapped (cos in even lanes).
+template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> cossin(const vec<T, N>& x)
+{
+    return sincos_mask(x, internal::evenmask<T, N>());
+}
+
+// Normalized-free sinc: sin(x)/x, with the removable singularity at 0 patched
+// to 1 for |x| within machine epsilon.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> sinc(const vec<T, N>& x)
+{
+    return select(abs(x) <= avoid_odr_use(constants<T>::epsilon), T(1), sin(x) / x);
+}
+
+// Scalar entry points (promote to flt_type<T> and forward to the vector overloads).
+KFR_HANDLE_SCALAR_1_T(sin, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(cos, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(fastsin, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(fastcos, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(sincos, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(cossin, flt_type<T>)
+KFR_HANDLE_SCALAR_1_T(sinc, flt_type<T>)
+
+// Overloads for non-floating-point element types (convert, then dispatch).
+KFR_HANDLE_NOT_F_1(sin)
+KFR_HANDLE_NOT_F_1(cos)
+KFR_HANDLE_NOT_F_1(fastsin)
+KFR_HANDLE_NOT_F_1(fastcos)
+KFR_HANDLE_NOT_F_1(sincos)
+KFR_HANDLE_NOT_F_1(cossin)
+KFR_HANDLE_NOT_F_1(sinc)
+
+// Degree-argument variants: scale by degtorad (pi/180) and reuse the radian
+// implementations. avoid_odr_use prevents taking the address of the constant.
+template <typename T, typename Tout = flt_type<T>>
+KFR_INTRINSIC Tout sindeg(const T& x)
+{
+    return sin(x * avoid_odr_use(constants<Tout>::degtorad));
+}
+
+template <typename T, typename Tout = flt_type<T>>
+KFR_INTRINSIC Tout cosdeg(const T& x)
+{
+    return cos(x * avoid_odr_use(constants<Tout>::degtorad));
+}
+
+template <typename T, typename Tout = flt_type<T>>
+KFR_INTRINSIC Tout fastsindeg(const T& x)
+{
+    return fastsin(x * avoid_odr_use(constants<Tout>::degtorad));
+}
+
+template <typename T, typename Tout = flt_type<T>>
+KFR_INTRINSIC Tout fastcosdeg(const T& x)
+{
+    return fastcos(x * avoid_odr_use(constants<Tout>::degtorad));
+}
+
+template <typename T, typename Tout = flt_type<T>>
+KFR_INTRINSIC Tout sincosdeg(const T& x)
+{
+    return sincos(x * avoid_odr_use(constants<Tout>::degtorad));
+}
+
+template <typename T, typename Tout = flt_type<T>>
+KFR_INTRINSIC Tout cossindeg(const T& x)
+{
+    return cossin(x * avoid_odr_use(constants<Tout>::degtorad));
+}
+} // namespace intrinsics
+
+KFR_I_FN(sin)
+KFR_I_FN(cos)
+KFR_I_FN(fastsin)
+KFR_I_FN(fastcos)
+KFR_I_FN(sincos)
+KFR_I_FN(cossin)
+
+KFR_I_FN(sindeg)
+KFR_I_FN(cosdeg)
+KFR_I_FN(fastsindeg)
+KFR_I_FN(fastcosdeg)
+KFR_I_FN(sincosdeg)
+KFR_I_FN(cossindeg)
+
+KFR_I_FN(sinc)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/sqrt.hpp b/include/kfr/math/impl/sqrt.hpp
@@ -0,0 +1,72 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../simd/impl/function.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+// Scalar (1-lane) sqrt: widen to a full SSE register, use the scalar sqrt
+// instruction, then slice lane 0 back out. _mm_sqrt_sd takes a second operand
+// for the upper lane; a zero register is passed since that lane is discarded.
+KFR_INTRINSIC f32x1 sqrt(const f32x1& x) { return slice<0, 1>(f32x4(_mm_sqrt_ss(extend<4>(x).v))); }
+KFR_INTRINSIC f64x1 sqrt(const f64x1& x)
+{
+    return slice<0, 1>(f64x2(_mm_sqrt_sd(_mm_setzero_pd(), extend<2>(x).v)));
+}
+KFR_INTRINSIC f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(x.v); }
+KFR_INTRINSIC f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(x.v); }
+
+#if defined CMT_ARCH_AVX
+KFR_INTRINSIC f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(x.v); }
+KFR_INTRINSIC f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(x.v); }
+#endif
+
+#if defined CMT_ARCH_AVX512
+KFR_INTRINSIC f32avx512 sqrt(const f32avx512& x) { return _mm512_sqrt_ps(x.v); }
+KFR_INTRINSIC f64avx512 sqrt(const f64avx512& x) { return _mm512_sqrt_pd(x.v); }
+#endif
+
+// Remaining float vector widths (only for floating-point element types).
+KFR_HANDLE_ALL_SIZES_1_IF(sqrt, is_f_class<T>::value)
+
+#else
+
+// fallback
+// Portable path: element-wise std::sqrt.
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> sqrt(const vec<T, N>& x)
+{
+    return apply([](T x) { return std::sqrt(x); }, x);
+}
+#endif
+// Scalar entry point (result promoted to flt_type<T>).
+KFR_HANDLE_SCALAR_1_T(sqrt, flt_type<T>)
+
+// Integer/other element types: convert to floating point, then sqrt.
+KFR_HANDLE_NOT_F_1(sqrt)
+} // namespace intrinsics
+KFR_I_FN(sqrt)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/impl/tan.hpp b/include/kfr/math/impl/tan.hpp
@@ -0,0 +1,149 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../math/abs.hpp"
+#include "../../math/select.hpp"
+#include "../../math/sin_cos.hpp"
+#include "../../simd/constants.hpp"
+#include "../../simd/impl/function.hpp"
+#include "../../simd/operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T, size_t N, typename IT = itype<T>>
+KFR_INTRINSIC vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse)
+{
+ constexpr T pi_14 = c_pi<T, 1, 4>;
+
+ vec<T, N> y = abs(x_full);
+ vec<T, N> scaled = y / pi_14;
+
+ vec<T, N> k_real = floor(scaled);
+ vec<IT, N> k = innercast<IT>(k_real);
+
+ vec<T, N> x = y - k_real * pi_14;
+
+ mask<T, N> need_offset = (k & 1) != 0;
+ x = select(need_offset, x - pi_14, x);
+
+ vec<IT, N> k_mod4 = k & 3;
+ inverse = (k_mod4 == 1) || (k_mod4 == 2);
+ return x;
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f32, N> tan(const vec<f32, N>& x_full)
+{
+ mask<f32, N> inverse;
+ vec<i32, N> quad;
+ const vec<f32, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
+ inverse = quad == 2 || quad == 6;
+
+ constexpr f32 tan_c2 = CMT_FP(0x5.555378p-4, 3.333315551280975342e-01);
+ constexpr f32 tan_c4 = CMT_FP(0x2.225bb8p-4, 1.333882510662078857e-01);
+ constexpr f32 tan_c6 = CMT_FP(0xd.ac3fep-8, 5.340956896543502808e-02);
+ constexpr f32 tan_c8 = CMT_FP(0x6.41644p-8, 2.443529665470123291e-02);
+ constexpr f32 tan_c10 = CMT_FP(0xc.bfe7ep-12, 3.112703096121549606e-03);
+ constexpr f32 tan_c12 = CMT_FP(0x2.6754dp-8, 9.389210492372512817e-03);
+
+ constexpr f32 cot_c2 = CMT_FP(-0x5.555558p-4, -3.333333432674407959e-01);
+ constexpr f32 cot_c4 = CMT_FP(-0x5.b0581p-8, -2.222204580903053284e-02);
+ constexpr f32 cot_c6 = CMT_FP(-0x8.ac5ccp-12, -2.117502503097057343e-03);
+ constexpr f32 cot_c8 = CMT_FP(-0xd.aaa01p-16, -2.085343148792162538e-04);
+ constexpr f32 cot_c10 = CMT_FP(-0x1.a9a9b4p-16, -2.537148611736483872e-05);
+ constexpr f32 cot_c12 = CMT_FP(-0x6.f7d4dp-24, -4.153305894760705996e-07);
+
+ const vec<f32, N> x2 = x * x;
+ const vec<f32, N> val = trig_horner(x2, inverse, 1.0f, 1.0f, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6,
+ tan_c6, cot_c8, tan_c8, cot_c10, tan_c10, cot_c12, tan_c12);
+
+ const vec<f32, N> z = select(inverse, val / -x, val * x);
+ return mulsign(z, x_full);
+}
+
+template <size_t N>
+KFR_INTRINSIC vec<f64, N> tan(const vec<f64, N>& x_full)
+{
+ mask<f64, N> inverse;
+ vec<i64, N> quad;
+ const vec<f64, N> x = trig_fold(x_full, quad); // trig_fold_simple(x_full, inverse);
+ inverse = quad == 2 || quad == 6;
+
+ constexpr f64 tan_c2 = 0x1.5555555555a3cp-2;
+ constexpr f64 tan_c4 = 0x1.11111110c4068p-3;
+ constexpr f64 tan_c6 = 0x1.ba1ba1ef36a4dp-5;
+ constexpr f64 tan_c8 = 0x1.664f3f4af7ce2p-6;
+ constexpr f64 tan_c10 = 0x1.226f2682a2616p-7;
+ constexpr f64 tan_c12 = 0x1.d6b440e73f61dp-9;
+ constexpr f64 tan_c14 = 0x1.7f06cdd30bd39p-10;
+ constexpr f64 tan_c16 = 0x1.2a8fab895738ep-11;
+ constexpr f64 tan_c18 = 0x1.34ff88cfdc292p-12;
+ constexpr f64 tan_c20 = -0x1.b4165ea04339fp-18;
+ constexpr f64 tan_c22 = 0x1.5f93701d86962p-13;
+ constexpr f64 tan_c24 = -0x1.5a13a3cdfb8c1p-14;
+ constexpr f64 tan_c26 = 0x1.77c69cef3306cp-15;
+
+ constexpr f64 cot_c2 = -0x1.5555555555555p-2;
+ constexpr f64 cot_c4 = -0x1.6c16c16c16dcdp-6;
+ constexpr f64 cot_c6 = -0x1.1566abbff68a7p-9;
+ constexpr f64 cot_c8 = -0x1.bbd7794ef9999p-13;
+ constexpr f64 cot_c10 = -0x1.66a8ea1991906p-16;
+ constexpr f64 cot_c12 = -0x1.228220068711cp-19;
+ constexpr f64 cot_c14 = -0x1.d65ed2c45e21dp-23;
+ constexpr f64 cot_c16 = -0x1.897ead4a2f71dp-26;
+ constexpr f64 cot_c18 = -0x1.b592dc8656ec9p-31;
+ constexpr f64 cot_c20 = -0x1.3dc07078c46d6p-29;
+ constexpr f64 cot_c22 = 0x1.06c9e5c370edcp-29;
+ constexpr f64 cot_c24 = -0x1.217f50c9dbca3p-30;
+ constexpr f64 cot_c26 = 0x1.163ed8171a0c8p-32;
+
+ const vec<f64, N> x2 = x * x;
+ const vec<f64, N> val =
+ trig_horner(x2, inverse, 1.0, 1.0, cot_c2, tan_c2, cot_c4, tan_c4, cot_c6, tan_c6, cot_c8, tan_c8,
+ cot_c10, tan_c10, cot_c12, tan_c12, cot_c14, tan_c14, cot_c16, tan_c16, cot_c18, tan_c18,
+ cot_c20, tan_c20, cot_c22, tan_c22, cot_c24, tan_c24, cot_c26, tan_c26);
+
+ const vec<f64, N> z = select(inverse, val / -x, val * x);
+ return mulsign(z, x_full);
+}
+
+KFR_HANDLE_SCALAR_1_T(tan, flt_type<T>)
+KFR_HANDLE_NOT_F_1(tan)
+
+template <typename T>
+KFR_INTRINSIC flt_type<T> tandeg(const T& x)
+{
+ return tan(x * c_degtorad<flt_type<T>>);
+}
+} // namespace intrinsics
+KFR_I_FN(tan)
+KFR_I_FN(tandeg)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/interpolation.hpp b/include/kfr/math/interpolation.hpp
@@ -0,0 +1,74 @@
+/** @addtogroup interpolation
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "select.hpp"
+#include "sin_cos.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T, typename M>
+KFR_FUNCTION T nearest(M mu, T x1, T x2)
+{
+ return select(mu < M(0.5), x1, x2);
+}
+
+template <typename T, typename M>
+KFR_FUNCTION T linear(M mu, T x1, T x2)
+{
+ return mix(mu, x1, x2);
+}
+
+template <typename T, typename M>
+KFR_FUNCTION T cosine(M mu, T x1, T x2)
+{
+ return mix((M(1) - fastcos(mu * c_pi<T>)) * M(0.5), x1, x2);
+}
+
+template <typename T, typename M>
+KFR_FUNCTION T cubic(M mu, T x0, T x1, T x2, T x3)
+{
+ const T a0 = x3 - x2 - x0 + x1;
+ const T a1 = x0 - x1 - a0;
+ const T a2 = x2 - x0;
+ const T a3 = x1;
+ return horner(mu, a0, a1, a2, a3);
+}
+
+template <typename T, typename M>
+KFR_FUNCTION T catmullrom(M mu, T x0, T x1, T x2, T x3)
+{
+ const T a0 = T(0.5) * (x3 - x0) - T(1.5) * (x2 - x1);
+ const T a1 = x0 - T(2.5) * x1 + T(2) * x2 - T(0.5) * x3;
+ const T a2 = T(0.5) * (x2 - x0);
+ const T a3 = x1;
+ return horner(mu, a0, a1, a2, a3);
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/log_exp.hpp b/include/kfr/math/log_exp.hpp
@@ -0,0 +1,232 @@
+/** @addtogroup exponential
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/log_exp.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Returns e raised to the given power x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> exp(const T1& x)
+{
+ return intrinsics::exp(x);
+}
+
+/// @brief Returns e raised to the given power x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::exp, E1> exp(E1&& x)
+{
+ return { fn::exp(), std::forward<E1>(x) };
+}
+
+/// @brief Returns 2 raised to the given power x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> exp2(const T1& x)
+{
+ return intrinsics::exp2(x);
+}
+
+/// @brief Returns 2 raised to the given power x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::exp2, E1> exp2(E1&& x)
+{
+ return { fn::exp2(), std::forward<E1>(x) };
+}
+
+/// @brief Returns 10 raised to the given power x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> exp10(const T1& x)
+{
+ return intrinsics::exp10(x);
+}
+
+/// @brief Returns 10 raised to the given power x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::exp10, E1> exp10(E1&& x)
+{
+ return { fn::exp10(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the natural logarithm of the x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> log(const T1& x)
+{
+ return intrinsics::log(x);
+}
+
+/// @brief Returns the natural logarithm of the x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::log, E1> log(E1&& x)
+{
+ return { fn::log(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the binary (base-2) logarithm of the x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> log2(const T1& x)
+{
+ return intrinsics::log2(x);
+}
+
+/// @brief Returns the binary (base-2) logarithm of the x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::log2, E1> log2(E1&& x)
+{
+ return { fn::log2(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the common (base-10) logarithm of the x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> log10(const T1& x)
+{
+ return intrinsics::log10(x);
+}
+
+/// @brief Returns the common (base-10) logarithm of the x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::log10, E1> log10(E1&& x)
+{
+ return { fn::log10(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the rounded binary (base-2) logarithm of the x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> logb(const T1& x)
+{
+ return intrinsics::logb(x);
+}
+
+/// @brief Returns the rounded binary (base-2) logarithm of the x. Version that accepts and returns
+/// expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::logb, E1> logb(E1&& x)
+{
+ return { fn::logb(), std::forward<E1>(x) };
+}
+
+/// @brief Returns the logarithm of the x with base y.
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_FUNCTION flt_type<common_type<T1, T2>> logn(const T1& x, const T2& y)
+{
+ return intrinsics::logn(x, y);
+}
+
+/// @brief Returns the logarithm of the x with base y. Accepts and returns expressions.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_FUNCTION internal::expression_function<fn::logn, E1, E2> logn(E1&& x, E2&& y)
+{
+ return { fn::logn(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/// @brief Returns log(x) * y.
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_FUNCTION flt_type<common_type<T1, T2>> logm(const T1& x, const T2& y)
+{
+ return intrinsics::logm(x, y);
+}
+
+/// @brief Returns log(x) * y. Accepts and returns expressions.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_FUNCTION internal::expression_function<fn::logm, E1, E2> logm(E1&& x, E2&& y)
+{
+ return { fn::logm(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/// @brief Returns exp(x * m + a).
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
+KFR_FUNCTION flt_type<common_type<T1, T2, T3>> exp_fmadd(const T1& x, const T2& y, const T3& z)
+{
+ return intrinsics::exp_fmadd(x, y, z);
+}
+
+/// @brief Returns exp(x * m + a). Accepts and returns expressions.
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
+KFR_FUNCTION internal::expression_function<fn::exp_fmadd, E1, E2, E3> exp_fmadd(E1&& x, E2&& y, E3&& z)
+{
+ return { fn::exp_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
+}
+
+/// @brief Returns log(x) * m + a.
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
+KFR_FUNCTION flt_type<common_type<T1, T2, T3>> log_fmadd(const T1& x, const T2& y, const T3& z)
+{
+ return intrinsics::log_fmadd(x, y, z);
+}
+
+/// @brief Returns log(x) * m + a. Accepts and returns expressions.
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
+KFR_FUNCTION internal::expression_function<fn::log_fmadd, E1, E2, E3> log_fmadd(E1&& x, E2&& y, E3&& z)
+{
+ return { fn::log_fmadd(), std::forward<E1>(x), std::forward<E2>(y), std::forward<E3>(z) };
+}
+
+/// @brief Returns the x raised to the given power y.
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_FUNCTION flt_type<common_type<T1, T2>> pow(const T1& x, const T2& y)
+{
+ return intrinsics::pow(x, y);
+}
+
+/// @brief Returns the x raised to the given power y. Accepts and returns expressions.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_FUNCTION internal::expression_function<fn::pow, E1, E2> pow(E1&& x, E2&& y)
+{
+ return { fn::pow(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/// @brief Returns the real nth root of the x.
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value)>
+KFR_FUNCTION flt_type<common_type<T1, T2>> root(const T1& x, const T2& y)
+{
+ return intrinsics::root(x, y);
+}
+
+/// @brief Returns the real nth root of the x. Accepts and returns expressions.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_FUNCTION internal::expression_function<fn::root, E1, E2> root(E1&& x, E2&& y)
+{
+ return { fn::root(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/// @brief Returns the cube root of the x.
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> cbrt(const T1& x)
+{
+ return intrinsics::cbrt(x);
+}
+
+/// @brief Returns the cube root of the x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cbrt, E1> cbrt(E1&& x)
+{
+ return { fn::cbrt(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/logical.hpp b/include/kfr/math/logical.hpp
@@ -0,0 +1,54 @@
+/** @addtogroup logical
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/logical.hpp"
+
+namespace kfr
+{
+
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns x[0] && x[1] && ... && x[N-1]
+ */
+template <typename T, size_t N>
+KFR_INTRINSIC bool all(const mask<T, N>& x)
+{
+ return intrinsics::bittestall(x.asvec());
+}
+
+/**
+ * @brief Returns x[0] || x[1] || ... || x[N-1]
+ */
+template <typename T, size_t N>
+KFR_INTRINSIC bool any(const mask<T, N>& x)
+{
+ return intrinsics::bittestany(x.asvec());
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/min_max.hpp b/include/kfr/math/min_max.hpp
@@ -0,0 +1,111 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/min_max.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the smaller of two values.
+ */
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout min(const T1& x, const T2& y)
+{
+ return intrinsics::min(x, y);
+}
+
+/**
+ * @brief Returns the smaller of two values. Accepts and returns expressions.
+ */
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::min, E1, E2> min(E1&& x, E2&& y)
+{
+ return { fn::min(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/**
+ * @brief Returns the greater of two values.
+ */
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout max(const T1& x, const T2& y)
+{
+ return intrinsics::max(x, y);
+}
+
+/**
+ * @brief Returns the greater of two values. Accepts and returns expressions.
+ */
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::max, E1, E2> max(E1&& x, E2&& y)
+{
+ return { fn::max(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/**
+ * @brief Returns the smaller in magnitude of two values.
+ */
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout absmin(const T1& x, const T2& y)
+{
+ return intrinsics::absmin(x, y);
+}
+
+/**
+ * @brief Returns the smaller in magnitude of two values. Accepts and returns expressions.
+ */
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::absmin, E1, E2> absmin(E1&& x, E2&& y)
+{
+ return { fn::absmin(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/**
+ * @brief Returns the greater in magnitude of two values.
+ */
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout absmax(const T1& x, const T2& y)
+{
+ return intrinsics::absmax(x, y);
+}
+
+/**
+ * @brief Returns the greater in magnitude of two values. Accepts and returns expressions.
+ */
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::absmax, E1, E2> absmax(E1&& x, E2&& y)
+{
+ return { fn::absmax(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/modzerobessel.hpp b/include/kfr/math/modzerobessel.hpp
@@ -0,0 +1,47 @@
+/** @addtogroup other_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/modzerobessel.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION T1 modzerobessel(const T1& x)
+{
+ return intrinsics::modzerobessel(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::modzerobessel, E1> modzerobessel(E1&& x)
+{
+ return { fn::modzerobessel(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/round.hpp b/include/kfr/math/round.hpp
@@ -0,0 +1,163 @@
+/** @addtogroup round
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/round.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Returns the largest integer value not greater than x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 floor(const T1& x)
+{
+ return intrinsics::floor(x);
+}
+
+/// @brief Returns the largest integer value not greater than x. Accepts and returns expressions.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::floor, E1> floor(E1&& x)
+{
+ return { fn::floor(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 ceil(const T1& x)
+{
+ return intrinsics::ceil(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::ceil, E1> ceil(E1&& x)
+{
+ return { fn::ceil(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 round(const T1& x)
+{
+ return intrinsics::round(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::round, E1> round(E1&& x)
+{
+ return { fn::round(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 trunc(const T1& x)
+{
+ return intrinsics::trunc(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::trunc, E1> trunc(E1&& x)
+{
+ return { fn::trunc(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 fract(const T1& x)
+{
+ return intrinsics::fract(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::fract, E1> fract(E1&& x)
+{
+ return { fn::fract(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC itype<T1> ifloor(const T1& x)
+{
+ return intrinsics::ifloor(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::ifloor, E1> ifloor(E1&& x)
+{
+ return { fn::ifloor(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC itype<T1> iceil(const T1& x)
+{
+ return intrinsics::iceil(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::iceil, E1> iceil(E1&& x)
+{
+ return { fn::iceil(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC itype<T1> iround(const T1& x)
+{
+ return intrinsics::iround(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::iround, E1> iround(E1&& x)
+{
+ return { fn::iround(), std::forward<E1>(x) };
+}
+
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC itype<T1> itrunc(const T1& x)
+{
+ return intrinsics::itrunc(x);
+}
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::itrunc, E1> itrunc(E1&& x)
+{
+ return { fn::itrunc(), std::forward<E1>(x) };
+}
+
+template <typename T, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC T fmod(const T& x, const T& y)
+{
+ return x - trunc(x / y) * y;
+}
+KFR_FN(fmod)
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+constexpr KFR_INTRINSIC vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return x % y;
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return fmod(x, y);
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/saturation.hpp b/include/kfr/math/saturation.hpp
@@ -0,0 +1,65 @@
+/** @addtogroup saturation
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/saturation.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Adds two arguments using saturation
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout satadd(const T1& x, const T2& y)
+{
+ return intrinsics::satadd(x, y);
+}
+
+/// @brief Creates an expression that adds two arguments using saturation
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::satadd, E1, E2> satadd(E1&& x, E2&& y)
+{
+ return { fn::satadd(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+/// @brief Subtracts two arguments using saturation
+template <typename T1, typename T2, KFR_ENABLE_IF(is_numeric_args<T1, T2>::value),
+ typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout satsub(const T1& x, const T2& y)
+{
+ return intrinsics::satsub(x, y);
+}
+
+/// @brief Creates an expression that subtracts two arguments using saturation
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::satsub, E1, E2> satsub(E1&& x, E2&& y)
+{
+ return { fn::satsub(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/select.hpp b/include/kfr/math/select.hpp
@@ -0,0 +1,59 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/select.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns x if m is true, otherwise return y. Order of the arguments is same as in ternary operator.
+ * @code
+ * return m ? x : y
+ * @endcode
+ */
+template <typename T1, size_t N, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value),
+ typename Tout = subtype<common_type<T2, T3>>>
+KFR_INTRINSIC vec<Tout, N> select(const mask<T1, N>& m, const T2& x, const T3& y)
+{
+ static_assert(sizeof(T1) == sizeof(Tout), "select: incompatible types");
+ return intrinsics::select(bitcast<Tout>(m.asvec()), innercast<Tout>(x), innercast<Tout>(y));
+}
+
+/**
+ * @brief Returns template expression that returns x if m is true, otherwise return y. Order of the arguments
+ * is same as in ternary operator.
+ */
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
+KFR_INTRINSIC internal::expression_function<fn::select, E1, E2, E3> select(E1&& m, E2&& x, E3&& y)
+{
+ return { fn::select(), std::forward<E1>(m), std::forward<E2>(x), std::forward<E3>(y) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/sin_cos.hpp b/include/kfr/math/sin_cos.hpp
@@ -0,0 +1,318 @@
+/** @addtogroup trigonometric
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/sin_cos.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the trigonometric sine of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sin(const T1& x)
+{
+ return intrinsics::sin(x);
+}
+
+/**
+ * @brief Returns the trigonometric sine of x. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sin, E1> sin(E1&& x)
+{
+ return { fn::sin(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric cosine of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> cos(const T1& x)
+{
+ return intrinsics::cos(x);
+}
+
+/**
+ * @brief Returns the trigonometric cosine of x. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cos, E1> cos(E1&& x)
+{
+ return { fn::cos(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric sine of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> fastsin(const T1& x)
+{
+ return intrinsics::fastsin(x);
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric sine of x. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::fastsin, E1> fastsin(E1&& x)
+{
+ return { fn::fastsin(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric cosine of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> fastcos(const T1& x)
+{
+ return intrinsics::fastcos(x);
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric cosine of x. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::fastcos, E1> fastcos(E1&& x)
+{
+ return { fn::fastcos(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must
+ * be a vector.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sincos(const T1& x)
+{
+ return intrinsics::sincos(x);
+}
+
+/**
+ * @brief Returns the trigonometric sine of the even elements of the x and
+ * cosine of the odd elements. x must be a vector. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sincos, E1> sincos(E1&& x)
+{
+ return { fn::sincos(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must
+ * be a vector.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> cossin(const T1& x)
+{
+ return intrinsics::cossin(x);
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the even elements of the x and
+ * sine of the odd elements. x must be a vector. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cossin, E1> cossin(E1&& x)
+{
+ return { fn::cossin(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric sine of the x (expressed in degrees).
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sindeg(const T1& x)
+{
+ return intrinsics::sindeg(x);
+}
+
+/**
+ * @brief Returns the trigonometric sine of the x (expressed in degrees). Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sindeg, E1> sindeg(E1&& x)
+{
+ return { fn::sindeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the x (expressed in degrees).
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> cosdeg(const T1& x)
+{
+ return intrinsics::cosdeg(x);
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the x (expressed in degrees). Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cosdeg, E1> cosdeg(E1&& x)
+{
+ return { fn::cosdeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric sine of the x (expressed in degrees).
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> fastsindeg(const T1& x)
+{
+ return intrinsics::fastsindeg(x);
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric sine of the x
+ * (expressed in degrees). Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::fastsindeg, E1> fastsindeg(E1&& x)
+{
+ return { fn::fastsindeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric cosine of the x (expressed in degrees).
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> fastcosdeg(const T1& x)
+{
+ return intrinsics::fastcosdeg(x);
+}
+
+/**
+ * @brief Returns an approximation of the trigonometric cosine of the x
+ * (expressed in degrees). Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::fastcosdeg, E1> fastcosdeg(E1&& x)
+{
+ return { fn::fastcosdeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric sine of the even elements of the x and cosine of the odd elements. x must
+ * be a vector and expressed in degrees.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sincosdeg(const T1& x)
+{
+ return intrinsics::sincosdeg(x);
+}
+
+/**
+ * @brief Returns the trigonometric sine of the even elements of the x and
+ * cosine of the odd elements. x must be expressed in degrees. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sincosdeg, E1> sincosdeg(E1&& x)
+{
+ return { fn::sincosdeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the even elements of the x and sine of the odd elements. x must
+ * be a vector and expressed in degrees.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> cossindeg(const T1& x)
+{
+ return intrinsics::cossindeg(x);
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the even elements of the x and
+ * sine of the odd elements. x must be expressed in degrees. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::cossindeg, E1> cossindeg(E1&& x)
+{
+ return { fn::cossindeg(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the sinc function of x.
+ * \f[
+ * sinc(x) = \frac{sin(x)}{x}
+ * \f]
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> sinc(const T1& x)
+{
+ return intrinsics::sinc(x);
+}
+
+/**
+ * @brief Returns the sinc function of x. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::sinc, E1> sinc(E1&& x)
+{
+ return { fn::sinc(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric sine of the angle 2x using sin(x) and cos(x).
+ */
+template <typename T>
+KFR_INTRINSIC T sin2x(const T& sinx, const T& cosx)
+{
+    // Double-angle identity: sin(2x) = 2*sin(x)*cos(x)
+    return 2 * sinx * cosx;
+}
+
+/**
+ * @brief Returns the trigonometric sine of the angle 3x using already computed sin(x) and cos(x).
+ */
+template <typename T>
+KFR_INTRINSIC T sin3x(const T& sinx, const T& cosx)
+{
+    // Triple-angle identity: sin(3x) = 3*sin(x) - 4*sin^3(x)
+    //                                = sin(x)*(3 - 4*(1 - cos^2(x))) = sin(x)*(4*cos^2(x) - 1)
+    return sinx * (-1 + 4 * sqr(cosx));
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the angle 2x using already computed sin(x) and cos(x).
+ */
+template <typename T>
+KFR_INTRINSIC T cos2x(const T& sinx, const T& cosx)
+{
+    // Double-angle identity: cos(2x) = cos^2(x) - sin^2(x)
+    return sqr(cosx) - sqr(sinx);
+}
+
+/**
+ * @brief Returns the trigonometric cosine of the angle 3x using already computed sin(x) and cos(x).
+ */
+template <typename T>
+KFR_INTRINSIC T cos3x(const T& sinx, const T& cosx)
+{
+    // Triple-angle identity: cos(3x) = 4*cos^3(x) - 3*cos(x) = cos(x)*(1 - 4*sin^2(x))
+    return cosx * (1 - 4 * sqr(sinx));
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/sqrt.hpp b/include/kfr/math/sqrt.hpp
@@ -0,0 +1,53 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/sqrt.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the positive square root of the x. \f$\sqrt{x}\f$
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC flt_type<T1> sqrt(const T1& x)
+{
+    // NOTE(review): behaviour for negative x is delegated to intrinsics::sqrt —
+    // presumably NaN, as with IEEE sqrt; confirm against the impl header.
+    return intrinsics::sqrt(x);
+}
+
+/**
+ * @brief Returns template expression that returns the positive square root of the x. \f$\sqrt{x}\f$
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::sqrt, E1> sqrt(E1&& x)
+{
+    return { fn::sqrt(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/tan.hpp b/include/kfr/math/tan.hpp
@@ -0,0 +1,59 @@
+/** @addtogroup trigonometric
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/tan.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/**
+ * @brief Returns the trigonometric tangent of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> tan(const T1& x)
+{
+    return intrinsics::tan(x);
+}
+
+/**
+ * @brief Returns the trigonometric tangent of x. Accepts and returns expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::tan, E1> tan(E1&& x)
+{
+    return { fn::tan(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns the trigonometric tangent of the x (expressed in degrees).
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_FUNCTION flt_type<T1> tandeg(const T1& x)
+{
+    return intrinsics::tandeg(x);
+}
+
+/**
+ * @brief Returns the trigonometric tangent of the x (expressed in degrees). Accepts and returns
+ * expressions.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_FUNCTION internal::expression_function<fn::tandeg, E1> tandeg(E1&& x)
+{
+    return { fn::tandeg(), std::forward<E1>(x) };
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/runtime.hpp b/include/kfr/runtime.hpp
@@ -0,0 +1,26 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "runtime/cpuid.hpp"
+#include "runtime/cpuid_auto.hpp"
diff --git a/include/kfr/runtime/cpuid.hpp b/include/kfr/runtime/cpuid.hpp
@@ -0,0 +1,300 @@
+/** @addtogroup cpuid
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include "../simd/platform.hpp"
+#include "../simd/types.hpp"
+#include <cstring>
+
+namespace kfr
+{
+#ifdef CMT_ARCH_X86
+
+// One-bit flags describing the CPU's instruction-set support, filled in by
+// detect_cpu() below from CPUID leaves 0, 1, 7, 0x80000001 and 0x80000002-4.
+struct cpu_features
+{
+    u32 max; // highest supported standard CPUID leaf (CPUID.0:EAX)
+    u32 exmax; // highest supported extended CPUID leaf (CPUID.0x80000000:EAX)
+    u32 isIntel : 1;
+    u32 isAMD : 1;
+    u32 has3DNOW : 1;
+    u32 has3DNOWEXT : 1;
+    u32 hasABM : 1;
+    u32 hasADX : 1;
+    u32 hasAES : 1;
+    u32 hasAVX : 1;
+    u32 hasAVX2 : 1;
+    u32 hasAVXOSSUPPORT : 1; // AVX usable: CPU support AND OS saves YMM state (XCR0)
+    u32 hasAVX512OSSUPPORT : 1; // AVX-512 usable: CPU support AND OS saves ZMM/opmask state
+    u32 hasAVX512CD : 1;
+    u32 hasAVX512ER : 1;
+    u32 hasAVX512F : 1;
+    u32 hasAVX512DQ : 1;
+    u32 hasAVX512PF : 1;
+    u32 hasAVX512BW : 1;
+    u32 hasAVX512VL : 1;
+    u32 hasBMI1 : 1;
+    u32 hasBMI2 : 1;
+    u32 hasCLFSH : 1;
+    u32 hasCMOV : 1;
+    u32 hasCMPXCHG16B : 1;
+    u32 hasCX8 : 1;
+    u32 hasERMS : 1;
+    u32 hasF16C : 1;
+    u32 hasFMA : 1;
+    u32 hasFSGSBASE : 1;
+    u32 hasFXSR : 1;
+    u32 hasHLE : 1;
+    u32 hasINVPCID : 1;
+    u32 hasLAHF : 1;
+    u32 hasLZCNT : 1;
+    u32 hasMMX : 1;
+    u32 hasMMXEXT : 1;
+    u32 hasMONITOR : 1;
+    u32 hasMOVBE : 1;
+    u32 hasMSR : 1;
+    u32 hasOSXSAVE : 1;
+    u32 hasPCLMULQDQ : 1;
+    u32 hasPOPCNT : 1;
+    u32 hasPREFETCHWT1 : 1;
+    u32 hasRDRAND : 1;
+    u32 hasRDSEED : 1;
+    u32 hasRDTSCP : 1;
+    u32 hasRTM : 1;
+    u32 hasSEP : 1;
+    u32 hasSHA : 1;
+    u32 hasSSE : 1;
+    u32 hasSSE2 : 1;
+    u32 hasSSE3 : 1;
+    u32 hasSSE41 : 1;
+    u32 hasSSE42 : 1;
+    u32 hasSSE4a : 1;
+    u32 hasSSSE3 : 1;
+    u32 hasSYSCALL : 1;
+    u32 hasTBM : 1;
+    u32 hasXOP : 1;
+    u32 hasXSAVE : 1;
+    u32 padding1 : 6;
+    char vendor[17]; // 12-char CPUID vendor string ("GenuineIntel"/"AuthenticAMD") + NUL; zero-initialized
+    char model[49]; // 48-char processor brand string from leaves 0x80000002-4 + NUL
+    char padding2[2];
+};
+
+namespace internal_generic
+{
+
+// Raw result of one CPUID invocation: data[0..3] = EAX, EBX, ECX, EDX.
+struct cpu_data
+{
+    u32 data[4];
+};
+
+#if defined CMT_COMPILER_GNU || defined CMT_COMPILER_CLANG
+// Executes CPUID with EAX=func, ECX=subfunc and stores the four result registers.
+// Always returns 1 (the return value is unused by callers in this file).
+KFR_INTRINSIC u32 get_cpuid(u32 func, u32 subfunc, u32* eax, u32* ebx, u32* ecx, u32* edx)
+{
+    __asm__("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(func), "2"(subfunc));
+    return 1;
+}
+// Convenience wrapper: fills ptr[0..3] with EAX/EBX/ECX/EDX of the given leaf.
+KFR_INTRINSIC void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
+{
+    get_cpuid(func, subfunc, &ptr[0], &ptr[1], &ptr[2], &ptr[3]);
+}
+// Reads the low 32 bits of XCR0 (extended control register 0, selected via ECX=0)
+// using XGETBV; EDX (high half) is clobbered and discarded. The bits checked by
+// detect_cpu (0x06, 0xE0) all live in the low half.
+KFR_INTRINSIC u32 get_xcr0()
+{
+    u32 xcr0;
+    __asm__ __volatile__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
+    return xcr0;
+}
+#elif defined CMT_COMPILER_MSVC
+
+// MSVC path: use the compiler intrinsics instead of inline asm.
+KFR_INTRINSIC void cpuid(u32* ptr, u32 func, u32 subfunc = 0)
+{
+    __cpuidex((int*)ptr, (int)func, (int)subfunc);
+}
+KFR_INTRINSIC u32 get_xcr0()
+{
+#ifdef _XCR_XFEATURE_ENABLED_MASK
+    unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+    return (u32)Result;
+#else
+    // Toolchain lacks _xgetbv support: report no OS-enabled extended state,
+    // which makes detect_cpu conservatively disable AVX/AVX-512.
+    return 0;
+#endif
+}
+#endif
+
+// Detects the best SIMD instruction tier supported by both the CPU and the OS.
+// Declared as a template (with an unused size_t parameter) so the definition can
+// live in a header without ODR violations — presumably; confirm intent with author.
+template <size_t = 0>
+cpu_t detect_cpu()
+{
+    cpu_features c;
+    memset(&c, 0, sizeof(c));
+    cpu_data data0;
+    cpu_data exdata0;
+
+    // Raw feature-flag registers: f_<leaf>_<register>.
+    u32 f_1_ECX(0);
+    u32 f_1_EDX(0);
+    u32 f_7_EBX(0);
+    u32 f_7_ECX(0);
+    u32 f_81_ECX(0);
+    u32 f_81_EDX(0);
+
+    // Leaf 0: highest standard leaf (EAX) + vendor string (EBX, EDX, ECX).
+    cpuid(data0.data, 0);
+    c.max = static_cast<u32>(data0.data[0]);
+    // Leaf 0x80000000: highest extended leaf.
+    cpuid(exdata0.data, 0x80000000);
+    c.exmax = static_cast<u32>(exdata0.data[0]);
+
+    // Vendor string is stored in register order EBX, EDX, ECX (12 bytes); the
+    // remaining bytes of c.vendor stay zero from the memset, NUL-terminating it.
+    *ptr_cast<u32>(c.vendor) = static_cast<u32>(data0.data[1]);
+    *ptr_cast<u32>(c.vendor + 4) = static_cast<u32>(data0.data[3]);
+    *ptr_cast<u32>(c.vendor + 8) = static_cast<u32>(data0.data[2]);
+
+    c.isIntel = strncmp(c.vendor, "GenuineIntel", sizeof(c.vendor)) == 0 ? 1 : 0;
+    c.isAMD = strncmp(c.vendor, "AuthenticAMD", sizeof(c.vendor)) == 0 ? 1 : 0;
+
+    // Leaf 1: basic feature flags in ECX/EDX.
+    if (c.max >= 1)
+    {
+        cpu_data data1;
+        cpuid(data1.data, 1);
+        f_1_ECX = static_cast<u32>(data1.data[2]);
+        f_1_EDX = static_cast<u32>(data1.data[3]);
+    }
+
+    // Leaf 7 (subleaf 0, the cpuid() default): structured extended features (AVX2, AVX-512, BMI...).
+    if (c.max >= 7)
+    {
+        cpu_data data7;
+        cpuid(data7.data, 7);
+        f_7_EBX = static_cast<u32>(data7.data[1]);
+        f_7_ECX = static_cast<u32>(data7.data[2]);
+    }
+
+    // Extended leaf 0x80000001: vendor-specific extensions (3DNow!, XOP, ABM...).
+    if (c.exmax >= 0x80000001)
+    {
+        cpu_data data81;
+        cpuid(data81.data, 0x80000001);
+        f_81_ECX = static_cast<u32>(data81.data[2]);
+        f_81_EDX = static_cast<u32>(data81.data[3]);
+    }
+
+    // Extended leaves 0x80000002-4: 48-byte processor brand string.
+    if (c.exmax >= 0x80000004)
+    {
+        cpu_data data82;
+        cpu_data data83;
+        cpu_data data84;
+        cpuid(data82.data, 0x80000002);
+        cpuid(data83.data, 0x80000003);
+        cpuid(data84.data, 0x80000004);
+        memcpy(c.model, data82.data, sizeof(cpu_data));
+        memcpy(c.model + 16, data83.data, sizeof(cpu_data));
+        memcpy(c.model + 32, data84.data, sizeof(cpu_data));
+    }
+
+    // Extract individual feature bits. `>>` binds tighter than `&`, so each line
+    // reads (reg >> bit) & 1. Vendor-specific bits are additionally gated on
+    // isIntel/isAMD where the bit's meaning differs between vendors.
+    c.hasSSE3 = f_1_ECX >> 0 & 1;
+    c.hasPCLMULQDQ = f_1_ECX >> 1 & 1;
+    c.hasMONITOR = f_1_ECX >> 3 & 1;
+    c.hasSSSE3 = f_1_ECX >> 9 & 1;
+    c.hasFMA = f_1_ECX >> 12 & 1;
+    c.hasCMPXCHG16B = f_1_ECX >> 13 & 1;
+    c.hasSSE41 = f_1_ECX >> 19 & 1;
+    c.hasSSE42 = f_1_ECX >> 20 & 1;
+    c.hasMOVBE = f_1_ECX >> 22 & 1;
+    c.hasPOPCNT = f_1_ECX >> 23 & 1;
+    c.hasAES = f_1_ECX >> 25 & 1;
+    c.hasXSAVE = f_1_ECX >> 26 & 1;
+    c.hasOSXSAVE = f_1_ECX >> 27 & 1;
+    c.hasAVX = f_1_ECX >> 28 & 1;
+    c.hasF16C = f_1_ECX >> 29 & 1;
+    c.hasRDRAND = f_1_ECX >> 30 & 1;
+    c.hasMSR = f_1_EDX >> 5 & 1;
+    c.hasCX8 = f_1_EDX >> 8 & 1;
+    c.hasSEP = f_1_EDX >> 11 & 1;
+    c.hasCMOV = f_1_EDX >> 15 & 1;
+    c.hasCLFSH = f_1_EDX >> 19 & 1;
+    c.hasMMX = f_1_EDX >> 23 & 1;
+    c.hasFXSR = f_1_EDX >> 24 & 1;
+    c.hasSSE = f_1_EDX >> 25 & 1;
+    c.hasSSE2 = f_1_EDX >> 26 & 1;
+    c.hasFSGSBASE = f_7_EBX >> 0 & 1;
+    c.hasBMI1 = f_7_EBX >> 3 & 1;
+    c.hasHLE = c.isIntel && f_7_EBX >> 4 & 1;
+    c.hasAVX2 = f_7_EBX >> 5 & 1;
+    c.hasBMI2 = f_7_EBX >> 8 & 1;
+    c.hasERMS = f_7_EBX >> 9 & 1;
+    c.hasINVPCID = f_7_EBX >> 10 & 1;
+    c.hasRTM = c.isIntel && f_7_EBX >> 11 & 1;
+    c.hasAVX512F = f_7_EBX >> 16 & 1;
+    c.hasAVX512DQ = f_7_EBX >> 17 & 1;
+    c.hasRDSEED = f_7_EBX >> 18 & 1;
+    c.hasADX = f_7_EBX >> 19 & 1;
+    c.hasAVX512PF = f_7_EBX >> 26 & 1;
+    c.hasAVX512ER = f_7_EBX >> 27 & 1;
+    c.hasAVX512CD = f_7_EBX >> 28 & 1;
+    c.hasSHA = f_7_EBX >> 29 & 1;
+    c.hasAVX512BW = f_7_EBX >> 30 & 1;
+    c.hasAVX512VL = f_7_EBX >> 31 & 1;
+    c.hasPREFETCHWT1 = f_7_ECX >> 0 & 1;
+    c.hasLAHF = f_81_ECX >> 0 & 1;
+    c.hasLZCNT = c.isIntel && f_81_ECX >> 5 & 1;
+    c.hasABM = c.isAMD && f_81_ECX >> 5 & 1;
+    c.hasSSE4a = c.isAMD && f_81_ECX >> 6 & 1;
+    c.hasXOP = c.isAMD && f_81_ECX >> 11 & 1;
+    c.hasTBM = c.isAMD && f_81_ECX >> 21 & 1;
+    c.hasSYSCALL = c.isIntel && f_81_EDX >> 11 & 1;
+    c.hasMMXEXT = c.isAMD && f_81_EDX >> 22 & 1;
+    c.hasRDTSCP = c.isIntel && f_81_EDX >> 27 & 1;
+    c.has3DNOWEXT = c.isAMD && f_81_EDX >> 30 & 1;
+    c.has3DNOW = c.isAMD && f_81_EDX >> 31 & 1;
+
+    // OS support: XCR0 bits 1-2 (0x06) = XMM+YMM state saved on context switch;
+    // bits 5-7 (0xE0) = opmask + upper-ZMM + hi16-ZMM state for AVX-512.
+    c.hasAVXOSSUPPORT = c.hasAVX && c.hasOSXSAVE && (get_xcr0() & 0x06) == 0x06;
+    c.hasAVX512OSSUPPORT = c.hasAVXOSSUPPORT && c.hasAVX512F && c.hasOSXSAVE && (get_xcr0() & 0xE0) == 0xE0;
+
+    // Pick the highest tier whose full feature set (and OS support) is present.
+    if (c.hasAVX512F && c.hasAVX512CD && c.hasAVX512VL && c.hasAVX512BW && c.hasAVX512DQ &&
+        c.hasAVX512OSSUPPORT)
+        return cpu_t::avx512;
+    if (c.hasAVX2 && c.hasAVXOSSUPPORT)
+        return cpu_t::avx2;
+    if (c.hasAVX && c.hasAVXOSSUPPORT)
+        return cpu_t::avx1;
+    if (c.hasSSE41)
+        return cpu_t::sse41;
+    if (c.hasSSSE3)
+        return cpu_t::ssse3;
+    if (c.hasSSE3)
+        return cpu_t::sse3;
+    if (c.hasSSE2)
+        return cpu_t::sse2;
+    return cpu_t::lowest;
+}
+} // namespace internal_generic
+#else
+
+template <size_t = 0>
+cpu_t detect_cpu()
+{
+ return cpu_t::native;
+}
+
+#endif
+} // namespace kfr
diff --git a/include/kfr/runtime/cpuid_auto.hpp b/include/kfr/runtime/cpuid_auto.hpp
@@ -0,0 +1,62 @@
+/** @addtogroup cpuid
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "cpuid.hpp"
+
+namespace kfr
+{
+
+namespace internal_generic
+{
+
+// Storage for the detected CPU tier (function-local static: initialized on
+// first use, safe before main). Defaults to cpu_t::native until detection runs.
+KFR_INTRINSIC cpu_t& cpu_v()
+{
+    static cpu_t v1 = cpu_t::native;
+    return v1;
+}
+
+// Runs runtime CPU detection once and stores the result in cpu_v().
+KFR_INTRINSIC char init_cpu_v()
+{
+    cpu_v() = detect_cpu<0>();
+    return 0;
+}
+
+// Local static guarantees init_cpu_v() executes at most once even if this
+// function is reached from multiple translation units.
+KFR_INTRINSIC char init_dummyvar()
+{
+    static char dummy = init_cpu_v();
+    return dummy;
+}
+
+// Forces detection during static initialization. NOTE(review): `static` gives
+// this variable internal linkage, so every TU including this header gets its
+// own copy (each triggering the same once-guarded init) — confirm intentional.
+static char dummyvar = init_dummyvar();
+} // namespace internal_generic
+
+/**
+ * @brief Returns cpu instruction set detected at runtime.
+ */
+KFR_FUNCTION cpu_t get_cpu() { return internal_generic::cpu_v(); }
+
+} // namespace kfr
diff --git a/include/kfr/simd.hpp b/include/kfr/simd.hpp
@@ -0,0 +1,36 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "simd/comparison.hpp"
+#include "simd/complex.hpp"
+#include "simd/constants.hpp"
+#include "simd/digitreverse.hpp"
+#include "simd/horizontal.hpp"
+#include "simd/mask.hpp"
+#include "simd/operators.hpp"
+#include "simd/platform.hpp"
+#include "simd/read_write.hpp"
+#include "simd/shuffle.hpp"
+#include "simd/types.hpp"
+#include "simd/vec.hpp"
diff --git a/include/kfr/simd/comparison.hpp b/include/kfr/simd/comparison.hpp
@@ -0,0 +1,152 @@
+/** @addtogroup logical
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "constants.hpp"
+#include "impl/function.hpp"
+#include "vec.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+// Named comparison functions: each returns an element-wise mask for the common
+// type of the two arguments, delegating to the corresponding operator.
+template <typename T1, typename T2>
+inline maskfor<common_type<T1, T2>> equal(const T1& x, const T2& y)
+{
+    return x == y;
+}
+template <typename T1, typename T2>
+inline maskfor<common_type<T1, T2>> notequal(const T1& x, const T2& y)
+{
+    return x != y;
+}
+template <typename T1, typename T2>
+inline maskfor<common_type<T1, T2>> less(const T1& x, const T2& y)
+{
+    return x < y;
+}
+template <typename T1, typename T2>
+inline maskfor<common_type<T1, T2>> greater(const T1& x, const T2& y)
+{
+    return x > y;
+}
+template <typename T1, typename T2>
+inline maskfor<common_type<T1, T2>> lessorequal(const T1& x, const T2& y)
+{
+    return x <= y;
+}
+template <typename T1, typename T2>
+inline maskfor<common_type<T1, T2>> greaterorequal(const T1& x, const T2& y)
+{
+    return x >= y;
+}
+// Generate the fn::equal, fn::notequal, ... functor types used by the
+// expression operators below.
+KFR_FN(equal)
+KFR_FN(notequal)
+KFR_FN(less)
+KFR_FN(greater)
+KFR_FN(lessorequal)
+KFR_FN(greaterorequal)
+
+// Comparison operators lifted to lazy template expressions: each builds an
+// expression_function node instead of comparing immediately.
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::equal, E1, E2> operator==(E1&& e1, E2&& e2)
+{
+    return { fn::equal(), std::forward<E1>(e1), std::forward<E2>(e2) };
+}
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::notequal, E1, E2> operator!=(E1&& e1, E2&& e2)
+{
+    return { fn::notequal(), std::forward<E1>(e1), std::forward<E2>(e2) };
+}
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::less, E1, E2> operator<(E1&& e1, E2&& e2)
+{
+    return { fn::less(), std::forward<E1>(e1), std::forward<E2>(e2) };
+}
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::greater, E1, E2> operator>(E1&& e1, E2&& e2)
+{
+    return { fn::greater(), std::forward<E1>(e1), std::forward<E2>(e2) };
+}
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::lessorequal, E1, E2> operator<=(E1&& e1, E2&& e2)
+{
+    return { fn::lessorequal(), std::forward<E1>(e1), std::forward<E2>(e2) };
+}
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::greaterorequal, E1, E2> operator>=(E1&& e1, E2&& e2)
+{
+    return { fn::greaterorequal(), std::forward<E1>(e1), std::forward<E2>(e2) };
+}
+
+// True for lanes holding NaN: relies on the IEEE-754 property that NaN
+// compares unequal to itself. NOTE(review): may not hold under fast-math
+// compiler flags — confirm build settings.
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> isnan(const vec<T, N>& x)
+{
+    return x != x;
+}
+
+// True for lanes equal to +infinity or -infinity.
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> isinf(const vec<T, N>& x)
+{
+    return x == avoid_odr_use(constants<T>::infinity) || x == -constants<T>::infinity;
+}
+
+// True for lanes that are neither NaN nor infinite.
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> isfinite(const vec<T, N>& x)
+{
+    return !isnan(x) && !isinf(x);
+}
+
+// True for lanes with the sign bit set (tests the bit directly, so -0.0 and
+// negative NaN also count as negative).
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> isnegative(const vec<T, N>& x)
+{
+    return (x & constants<T>::highbitmask()) != 0;
+}
+
+// True for lanes with the sign bit clear (complement of isnegative).
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> ispositive(const vec<T, N>& x)
+{
+    return !isnegative(x);
+}
+
+// True for lanes equal to zero (value comparison, so -0.0 == T() also matches).
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> iszero(const vec<T, N>& x)
+{
+    return x == T();
+}
+
+// True for lanes within the inclusive range [min, max].
+template <typename T1, typename T2, typename T3>
+KFR_INTRINSIC maskfor<common_type<T1, T2, T3>> inrange(const T1& x, const T2& min, const T3& max)
+{
+    return x >= min && x <= max;
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/complex.hpp b/include/kfr/simd/complex.hpp
@@ -0,0 +1,468 @@
+/** @addtogroup complex
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+#include "constants.hpp"
+#include "impl/function.hpp"
+#include "operators.hpp"
+
+#ifdef KFR_STD_COMPLEX
+#include <complex>
+#endif
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4814))
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+#ifdef KFR_STD_COMPLEX
+
+template <typename T>
+using complex = std::complex<T>;
+
+#else
+#ifndef KFR_CUSTOM_COMPLEX
+
+/**
+ * @brief Represents the complex numbers. If KFR_STD_COMPLEX is defined, then kfr::complex is an alias for
+ * std::complex.
+ */
+template <typename T>
+struct complex
+{
+    // Only scalar SIMD-compatible element types are allowed.
+    static_assert(is_simd_type<T>::value, "Incorrect type for complex");
+    constexpr static bool is_pod = true;
+    constexpr complex() CMT_NOEXCEPT = default;
+    // Implicit conversion from a real value: imaginary part is zero.
+    KFR_MEM_INTRINSIC constexpr complex(T re) CMT_NOEXCEPT : re(re), im(0) {}
+    KFR_MEM_INTRINSIC constexpr complex(T re, T im) CMT_NOEXCEPT : re(re), im(im) {}
+    constexpr complex(const complex&) CMT_NOEXCEPT = default;
+    constexpr complex(complex&&) CMT_NOEXCEPT = default;
+    // Converting constructors from complex<U> (per-component static_cast).
+    template <typename U>
+    KFR_MEM_INTRINSIC constexpr complex(const complex<U>& other) CMT_NOEXCEPT : re(static_cast<T>(other.re)),
+                                                                                im(static_cast<T>(other.im))
+    {
+    }
+    template <typename U>
+    KFR_MEM_INTRINSIC constexpr complex(complex<U>&& other) CMT_NOEXCEPT : re(std::move(other.re)),
+                                                                           im(std::move(other.im))
+    {
+    }
+#ifdef CMT_COMPILER_GNU
+    // GCC/Clang accept constexpr defaulted assignment; MSVC (below) does not.
+    constexpr complex& operator=(const complex&) CMT_NOEXCEPT = default;
+    constexpr complex& operator=(complex&&) CMT_NOEXCEPT = default;
+#else
+    complex& operator=(const complex&) = default;
+    complex& operator=(complex&&) = default;
+#endif
+    // std::complex-style accessors over the public re/im fields.
+    KFR_MEM_INTRINSIC constexpr const T& real() const CMT_NOEXCEPT { return re; }
+    KFR_MEM_INTRINSIC constexpr const T& imag() const CMT_NOEXCEPT { return im; }
+    KFR_MEM_INTRINSIC constexpr void real(T value) CMT_NOEXCEPT { re = value; }
+    KFR_MEM_INTRINSIC constexpr void imag(T value) CMT_NOEXCEPT { im = value; }
+    // Storage: real part first, then imaginary part (interleaved layout in vectors).
+    T re;
+    T im;
+
+    // Arithmetic is routed through 1-element SIMD vectors (make_vector) so the
+    // vectorized complex operators are the single implementation.
+    KFR_MEM_INTRINSIC friend complex operator+(const complex& x, const complex& y)
+    {
+        return (make_vector(x) + make_vector(y))[0];
+    }
+    KFR_MEM_INTRINSIC friend complex operator-(const complex& x, const complex& y)
+    {
+        return (make_vector(x) - make_vector(y))[0];
+    }
+    KFR_MEM_INTRINSIC friend complex operator*(const complex& x, const complex& y)
+    {
+        return (make_vector(x) * make_vector(y))[0];
+    }
+    KFR_MEM_INTRINSIC friend complex operator/(const complex& x, const complex& y)
+    {
+        return (make_vector(x) / make_vector(y))[0];
+    }
+
+    // Mixed complex/scalar operators: both operands are promoted to the common
+    // type C (a complex type) before the operation.
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator+(const complex& x, const U& y)
+    {
+        return static_cast<C>(x) + static_cast<C>(y);
+    }
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator-(const complex& x, const U& y)
+    {
+        return static_cast<C>(x) - static_cast<C>(y);
+    }
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator*(const complex& x, const U& y)
+    {
+        return static_cast<C>(x) * static_cast<C>(y);
+    }
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator/(const complex& x, const U& y)
+    {
+        return static_cast<C>(x) / static_cast<C>(y);
+    }
+
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator+(const U& x, const complex& y)
+    {
+        return static_cast<C>(x) + static_cast<C>(y);
+    }
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator-(const U& x, const complex& y)
+    {
+        return static_cast<C>(x) - static_cast<C>(y);
+    }
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator*(const U& x, const complex& y)
+    {
+        return static_cast<C>(x) * static_cast<C>(y);
+    }
+    template <typename U, KFR_ENABLE_IF(is_number<U>::value), typename C = common_type<complex, U>>
+    KFR_MEM_INTRINSIC friend C operator/(const U& x, const complex& y)
+    {
+        return static_cast<C>(x) / static_cast<C>(y);
+    }
+    // Unary minus negates both components (via the SIMD path); unary plus is identity.
+    KFR_MEM_INTRINSIC friend complex operator-(const complex& x) { return (-make_vector(x))[0]; }
+    KFR_MEM_INTRINSIC friend complex operator+(const complex& x) { return x; }
+};
+#endif
+#endif
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+namespace cometa
+{
+// Teaches cometa that kfr::complex<T> is a compound of two T subelements
+// (real at index 0, imaginary at index 1), enabling flattening and rebinding.
+template <typename T>
+struct compound_type_traits<kfr::complex<T>>
+{
+    constexpr static size_t width = 2;
+    constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+    using subtype = T;
+    using deep_subtype = cometa::deep_subtype<T>;
+    constexpr static bool is_scalar = false;
+    constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
+    // rebind swaps the element type; deep_rebind recurses into nested compounds.
+    template <typename U>
+    using rebind = kfr::complex<U>;
+    template <typename U>
+    using deep_rebind = kfr::complex<typename compound_type_traits<subtype>::template deep_rebind<U>>;
+
+    // Component access: 0 -> real part, 1 -> imaginary part.
+    static constexpr subtype at(const kfr::complex<T>& value, size_t index)
+    {
+        return index == 0 ? value.real() : value.imag();
+    }
+};
+} // namespace cometa
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Alias for complex<f32>
+using c32 = complex<f32>;
+
+/// @brief Alias for complex<f64>
+using c64 = complex<f64>;
+
+/// @brief Alias for complex<fbase>
+using cbase = complex<fbase>;
+
+namespace intrinsics
+{
+// Convert a 2-element vector (re, im) to a complex value.
+template <typename T>
+constexpr inline complex<T> vcomplex(const vec<T, 2>& v)
+{
+    return complex<T>(v.front(), v.back());
+}
+// Convert a complex value to a 2-element vector (re, im).
+template <typename T>
+constexpr inline vec<T, 2> vcomplex(const complex<T>& v)
+{
+    return vec<T, 2>(v.real(), v.imag());
+}
+// Convert a complex value directly to the low-level simd<T, 2> backend type.
+template <typename T>
+constexpr inline simd<T, 2> vvcomplex(const complex<T>& v)
+{
+    return intrinsics::simd_make(ctype<T>, v.real(), v.imag());
+}
+} // namespace intrinsics
+
+// Shuffle a vector of complex values. Each complex index is scaled by 2 so the
+// underlying scalar shuffle moves whole (re, im) pairs together.
+template <typename T, size_t N, size_t... indices>
+KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevector(const vec<complex<T>, N>& x,
+                                                                csizes_t<indices...>) CMT_NOEXCEPT
+{
+    return intrinsics::simd_shuffle(intrinsics::simd_t<T, N>{}, x.v, scale<2, indices...>(), overload_auto);
+}
+// Two-input complex shuffle. NOTE(review): the second operand is declared
+// vec<T, N> while the first is vec<complex<T>, N>; only .v is used, but the
+// asymmetry looks unintended — confirm whether y should be vec<complex<T>, N>.
+template <typename T, size_t N, size_t... indices>
+KFR_INTRINSIC vec<complex<T>, sizeof...(indices)> shufflevectors(const vec<complex<T>, N>& x,
+                                                                 const vec<T, N>& y,
+                                                                 csizes_t<indices...>) CMT_NOEXCEPT
+{
+    return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, scale<2, indices...>(),
+                                    overload_auto);
+}
+namespace internal
+{
+// Flatten/unflatten a single complex value to/from a 2-element scalar vector.
+template <typename T>
+struct compoundcast<complex<T>>
+{
+    static vec<T, 2> to_flat(const complex<T>& x) { return { x.real(), x.imag() }; }
+    static complex<T> from_flat(const vec<T, 2>& x) { return { x.front(), x.back() }; }
+};
+
+// Flatten/unflatten whole complex vectors: N complex values <-> 2N interleaved
+// scalars (from_flat halves the scalar count).
+template <typename T, size_t N>
+struct compoundcast<vec<complex<T>, N>>
+{
+    static vec<T, N * 2> to_flat(const vec<complex<T>, N>& x) { return x.flatten(); }
+    static vec<complex<T>, N / 2> from_flat(const vec<T, N>& x)
+    {
+        return vec<complex<T>, N / 2>::from_flatten(x);
+    }
+};
+} // namespace internal
+
+// "Compose": reinterpret N interleaved scalars as N/2 complex values.
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<complex<T>, N / 2> ccomp(const vec<T, N>& x)
+{
+    return vec<complex<T>, N / 2>::from_flatten(x);
+}
+
+// "Decompose": flatten N complex values into 2N interleaved scalars (re, im, ...).
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N * 2> cdecom(const vec<complex<T>, N>& x)
+{
+    return x.flatten();
+}
+
+/// @brief Returns vector of complex values with real part duplicated
+// Implemented by duplicating even (real) lanes of the flattened vector.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cdupreal(const vec<complex<T>, N>& x)
+{
+    return ccomp(dupeven(cdecom(x)));
+}
+KFR_FN(cdupreal)
+
+/// @brief Returns vector of complex values with imaginary part duplicated
+// Implemented by duplicating odd (imaginary) lanes of the flattened vector.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cdupimag(const vec<complex<T>, N>& x)
+{
+    return ccomp(dupodd(cdecom(x)));
+}
+KFR_FN(cdupimag)
+
+/// @brief Returns vector of complex values with real and imaginary parts swapped
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cswapreim(const vec<complex<T>, N>& x)
+{
+    return ccomp(swap<2>(cdecom(x)));
+}
+KFR_FN(cswapreim)
+
+/// @brief Returns vector of complex values with real part negated
+// XOR with (-0, +0) flips only the sign bit of the real component.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cnegreal(const vec<complex<T>, N>& x)
+{
+    return x ^ complex<T>(-T(), T());
+}
+KFR_FN(cnegreal)
+
+/// @brief Returns vector of complex values with imaginary part negated
+// XOR with (+0, -0) flips only the sign bit of the imaginary component.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cnegimag(const vec<complex<T>, N>& x)
+{
+    return x ^ complex<T>(T(), -T());
+}
+KFR_FN(cnegimag)
+
+namespace internal
+{
+// Trait: is T a kfr::complex specialization?
+template <typename T>
+struct is_complex_impl : std::false_type
+{
+};
+template <typename T>
+struct is_complex_impl<complex<T>> : std::true_type
+{
+};
+
+// vector<complex> to vector<complex>
+// Casts by converting the flattened (interleaved) scalar vector element-wise.
+template <typename To, typename From, size_t N>
+struct conversion<vec<complex<To>, N>, vec<complex<From>, N>>
+{
+    static_assert(!is_compound<To>::value, "");
+    static_assert(!is_compound<From>::value, "");
+    static vec<complex<To>, N> cast(const vec<complex<From>, N>& value)
+    {
+        return vec<To, N * 2>(value.flatten()).v;
+    }
+};
+
+// vector to vector<complex>
+// Real input becomes the real parts; imaginary parts are filled with zeros
+// by interleaving with a zero vector.
+template <typename To, typename From, size_t N>
+struct conversion<vec<complex<To>, N>, vec<From, N>>
+{
+    static_assert(!is_compound<To>::value, "");
+    static_assert(!is_compound<From>::value, "");
+    static vec<complex<To>, N> cast(const vec<From, N>& value)
+    {
+        const vec<To, N> casted = static_cast<vec<To, N>>(value);
+        return interleave(casted, zerovector(casted)).v;
+    }
+};
+
+} // namespace internal
+
+/// @brief Returns the real part of the complex value
+// For plain numeric (non-complex) values the value itself is its real part.
+template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)>
+constexpr KFR_INTRINSIC T real(const T& value)
+{
+    return value;
+}
+
+/// @brief Returns the real part of the complex value
+template <typename T>
+constexpr KFR_INTRINSIC T real(const complex<T>& value)
+{
+    return value.real();
+}
+
+/// @brief Returns the real part of the complex value
+// Real parts occupy the even lanes of the flattened complex vector.
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> real(const vec<complex<T>, N>& value)
+{
+    return even(cdecom(value));
+}
+
+// Result type of real(T): T itself for scalars, the element type for complex.
+template <typename T>
+using realtype = decltype(kfr::real(std::declval<T>()));
+// Same, promoted to a floating-point type.
+template <typename T>
+using realftype = ftype<decltype(kfr::real(std::declval<T>()))>;
+
+KFR_FN(real)
+
+/// @brief Returns the real part of the complex value
+// Expression overload: wraps the input in a lazy fn::real expression node.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::real, E1> real(E1&& x)
+{
+    return { {}, std::forward<E1>(x) };
+}
+
+/// @brief Returns the imaginary part of the complex value
+template <typename T>
+constexpr KFR_INTRINSIC T imag(const complex<T>& value)
+{
+    return value.imag();
+}
+
+/// @brief Returns the imaginary part of the complex value
+// Imaginary parts occupy the odd lanes of the flattened complex vector.
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> imag(const vec<complex<T>, N>& value)
+{
+    return odd(cdecom(value));
+}
+KFR_FN(imag)
+
+/// @brief Returns the imaginary part of the complex value
+// Expression overload: wraps the input in a lazy fn::imag expression node.
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::imag, E1> imag(E1&& x)
+{
+    return { {}, std::forward<E1>(x) };
+}
+
+/// @brief Constructs complex value from real and imaginary parts
+// Vector form: interleaves the (converted) real and imaginary vectors into the
+// interleaved complex layout. The imaginary part defaults to zero.
+template <typename T1, typename T2 = T1, size_t N, typename T = common_type<T1, T2>>
+constexpr KFR_INTRINSIC vec<complex<T>, N> make_complex(const vec<T1, N>& real,
+                                                        const vec<T2, N>& imag = T2(0))
+{
+    return ccomp(interleave(innercast<T>(real), innercast<T>(imag)));
+}
+
+/// @brief Constructs complex value from real and imaginary parts
+// Scalar form; both parts are cast to the common type T. Imaginary defaults to zero.
+template <typename T1, typename T2 = T1, typename T = common_type<T1, T2>>
+constexpr KFR_INTRINSIC complex<T> make_complex(T1 real, T2 imag = T2(0))
+{
+    return complex<T>(innercast<T>(real), innercast<T>(imag));
+}
+
+namespace intrinsics
+{
+// Complex conjugate of a vector: negate the imaginary parts.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> cconj(const vec<complex<T>, N>& x)
+{
+    return cnegimag(x);
+}
+} // namespace intrinsics
+
+/// @brief Returns the complex conjugate of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+KFR_INTRINSIC T1 cconj(const T1& x)
+{
+    return intrinsics::cconj(x);
+}
+
+// Metafunction binding the vector width N, leaving the element type open:
+// vec_of_complex<N>::type<T> == vec<complex<T>, N>. Used by the
+// common_type_impl specializations below.
+template <size_t N>
+struct vec_of_complex
+{
+    template <typename T>
+    using type = vec<complex<T>, N>;
+};
+} // namespace CMT_ARCH_NAME
+
+// common_type rules for complex: the common type of anything involving a
+// complex operand is complex (or vec<complex, N> when a vector is involved),
+// with the element type derived from the subtypes' common type.
+template <typename T1, typename T2>
+struct common_type_impl<kfr::complex<T1>, kfr::complex<T2>> : common_type_from_subtypes<T1, T2, kfr::complex>
+{
+};
+template <typename T1, typename T2>
+struct common_type_impl<kfr::complex<T1>, T2> : common_type_from_subtypes<T1, T2, kfr::complex>
+{
+};
+template <typename T1, typename T2>
+struct common_type_impl<T1, kfr::complex<T2>> : common_type_from_subtypes<T1, T2, kfr::complex>
+{
+};
+// complex scalar combined with a vector yields a vector of complex.
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::complex<T1>, kfr::vec<kfr::complex<T2>, N>>
+    : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type>
+{
+};
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::vec<kfr::complex<T1>, N>, kfr::vec<kfr::complex<T2>, N>>
+    : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type>
+{
+};
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::vec<kfr::complex<T1>, N>, kfr::complex<T2>>
+    : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type>
+{
+};
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::complex<T1>, kfr::vec<T2, N>>
+    : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type>
+{
+};
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::vec<T1, N>, kfr::complex<T2>>
+    : common_type_from_subtypes<T1, T2, kfr::vec_of_complex<N>::template type>
+{
+};
+} // namespace kfr
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/simd/constants.hpp b/include/kfr/simd/constants.hpp
@@ -0,0 +1,160 @@
+/** @addtogroup constants
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "types.hpp"
+#include <limits>
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4309))
+CMT_PRAGMA_MSVC(warning(disable : 4146))
+
+namespace kfr
+{
+
+#if CMT_COMPILER_GNU
+constexpr double infinity = __builtin_inf();
+constexpr double qnan = __builtin_nan("");
+#else
+constexpr double infinity = HUGE_VAL;
+constexpr double qnan = NAN;
+#endif
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Woverflow")
+
+// Scalar math constants for element type T, plus range-reduction constants
+// used by the trigonometric implementations.
+template <typename T>
+struct scalar_constants
+{
+    // pi scaled by a rational factor: pi_s(m, d) == pi * m / d.
+    constexpr static T pi_s(int m, int d = 1) { return pi * m / d; }
+    // 1/pi scaled by a rational factor: recip_pi_s(m, d) == (1/pi) * m / d.
+    constexpr static T recip_pi_s(int m, int d = 1) { return recip_pi * m / d; }
+
+    constexpr static T pi = static_cast<T>(3.1415926535897932384626433832795);
+    constexpr static T sqr_pi = static_cast<T>(9.8696044010893586188344909998762);
+    constexpr static T recip_pi = static_cast<T>(0.31830988618379067153776752674503);
+    constexpr static T degtorad = static_cast<T>(pi / 180);
+    // Fixed: the radians->degrees factor is 180/pi (~57.2958), matching
+    // c_radtodeg below; the previous value pi * 180 was incorrect.
+    constexpr static T radtodeg = static_cast<T>(57.295779513082320876798154814105);
+    constexpr static T e = static_cast<T>(2.718281828459045235360287471352662);
+    constexpr static T recip_log_2 = static_cast<T>(1.442695040888963407359924681001892137426645954);
+    constexpr static T recip_log_10 = static_cast<T>(0.43429448190325182765112891891661);
+    constexpr static T log_2 = static_cast<T>(0.69314718055994530941723212145818);
+    constexpr static T log_10 = static_cast<T>(2.3025850929940456840179914546844);
+    constexpr static T sqrt_2 = static_cast<T>(1.4142135623730950488016887242097);
+
+    // pi/4 (~0.785398) selected per precision; used as the division constant
+    // for trigonometric argument reduction.
+    constexpr static T fold_constant_div = choose_const<T>(
+        CMT_FP(0x1.921fb6p-1f, 7.8539818525e-01f), CMT_FP(0x1.921fb54442d18p-1, 7.853981633974482790e-01));
+
+    // High/low split of pi/4 (hi + rem1 + rem2 ~= pi/4) for extended-precision
+    // argument reduction.
+    constexpr static T fold_constant_hi = choose_const<T>(
+        CMT_FP(0x1.922000p-1f, 7.8540039062e-01f), CMT_FP(0x1.921fb40000000p-1, 7.853981256484985352e-01));
+    constexpr static T fold_constant_rem1 =
+        choose_const<T>(CMT_FP(-0x1.2ae000p-19f, -2.2267922759e-06f),
+                        CMT_FP(0x1.4442d00000000p-25, 3.774894707930798177e-08));
+    constexpr static T fold_constant_rem2 =
+        choose_const<T>(CMT_FP(-0x1.de973ep-32f, -4.3527578764e-10f),
+                        CMT_FP(0x1.8469898cc5170p-49, 2.695151429079059484e-15));
+
+    // Limits forwarded from std::numeric_limits<T>.
+    constexpr static T epsilon = std::numeric_limits<T>::epsilon();
+    constexpr static T infinity = std::numeric_limits<T>::infinity();
+    constexpr static T neginfinity = -std::numeric_limits<T>::infinity();
+    constexpr static T qnan = std::numeric_limits<T>::quiet_NaN();
+};
+
+// Constants for (possibly compound) T: inherits the scalar constants of T's
+// scalar subtype, so constants<vec<f32, N>> exposes f32 values.
+template <typename T>
+struct constants : public scalar_constants<subtype<T>>
+{
+public:
+    using Tsub = subtype<T>;
+};
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+/// π (pi)
+/// c_pi<f64, 4> = 4pi
+/// c_pi<f64, 3, 4> = 3/4pi
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_pi = subtype<T>(3.1415926535897932384626433832795 * m / d);
+
+/// π² (pi²)
+/// c_sqr_pi<f64, 4> = 4pi²
+/// c_sqr_pi<f64, 3, 4> = 3/4pi²
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_sqr_pi = subtype<T>(9.8696044010893586188344909998762 * m / d);
+
+/// 1/π (1/pi)
+/// c_recip_pi<f64> 1/pi
+/// c_recip_pi<f64, 4> 4/pi
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_recip_pi = subtype<T>(0.31830988618379067153776752674503 * m / d);
+
+/// degree to radian conversion factor (pi/180)
+template <typename T>
+constexpr subtype<T> c_degtorad = c_pi<T, 1, 180>;
+
+/// radian to degree conversion factor (180/pi)
+template <typename T>
+constexpr subtype<T> c_radtodeg = c_recip_pi<T, 180>;
+
+/// e, Euler's number
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_e = subtype<T>(2.718281828459045235360287471352662 * m / d);
+
+// Number of explicit mantissa (fraction) bits in the IEEE 754 representation
+// of the scalar subtype: 23 for binary32 (float), 52 for binary64 (double).
+// Fixed: sizeof yields bytes, so the test must be against 4, not 32; the old
+// `sizeof(subtype<T>) == 32` was never true and gave 52 for f32 as well.
+template <typename T>
+constexpr unsigned c_mantissa_bits = sizeof(subtype<T>) == 4 ? 23 : 52;
+
+// Bit mask covering the mantissa bits. NOTE(review): the shift operates on
+// subtype<T>(1), which is only well-formed for integer subtypes — confirm
+// this template is instantiated with integer T only.
+template <typename T>
+constexpr subtype<T> c_mantissa_mask = (subtype<T>(1) << c_mantissa_bits<T>)-1;
+
+/// machine epsilon of the scalar subtype
+template <typename T>
+constexpr subtype<T> c_epsilon = (std::numeric_limits<subtype<T>>::epsilon());
+
+/// infinity
+template <typename T>
+constexpr subtype<T> c_infinity = std::numeric_limits<subtype<T>>::infinity();
+
+/// -infinity
+template <typename T>
+constexpr subtype<T> c_neginfinity = -std::numeric_limits<subtype<T>>::infinity();
+
+/// Quiet NaN
+template <typename T>
+constexpr subtype<T> c_qnan = std::numeric_limits<subtype<T>>::quiet_NaN();
+
+/// 1/ln(2) (log2(e))
+template <typename T>
+constexpr subtype<T> c_recip_log_2 = subtype<T>(1.442695040888963407359924681001892137426645954);
+
+/// 1/ln(10) (log10(e))
+template <typename T>
+constexpr subtype<T> c_recip_log_10 = subtype<T>(0.43429448190325182765112891891661);
+
+/// ln(2)
+template <typename T>
+constexpr subtype<T> c_log_2 = subtype<T>(0.69314718055994530941723212145818);
+
+/// ln(10)
+template <typename T>
+constexpr subtype<T> c_log_10 = subtype<T>(2.3025850929940456840179914546844);
+
+/// sqrt(2), optionally scaled by m/d
+template <typename T, int m = 1, int d = 1>
+constexpr subtype<T> c_sqrt_2 = subtype<T>(1.4142135623730950488016887242097 * m / d);
+} // namespace kfr
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/simd/digitreverse.hpp b/include/kfr/simd/digitreverse.hpp
@@ -0,0 +1,110 @@
+/** @addtogroup shuffle
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+#include "shuffle.hpp"
+#include "types.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace internal
+{
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-overflow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshift-count-negative")
+
+// Recursion terminator: no more (mask, shift) pairs to apply.
+constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32>) { return x; }
+
+// Applies one bit-permutation step ((x & m) << shift) | ((x >> shift) & m) —
+// exchanging the bit groups selected by mask m across distance `shift` —
+// then recurses on the remaining (mask, shift) pairs.
+template <u32 m, u32 shift, u32... values>
+constexpr inline u32 bit_permute_step_impl(u32 x, cvals_t<u32, m, shift, values...>)
+{
+    return bit_permute_step_impl(((x & m) << shift) | ((x >> shift) & m), cvals_t<u32, values...>());
+}
+
+// Radix-2 (bit) reversal: performs a full 32-bit reversal via successive
+// swap steps (1, 2, 4, 8, 16), then shifts right so only the low `bits`
+// positions are reversed.
+template <size_t bits>
+constexpr inline u32 digitreverse_impl(u32 x, csize_t<2>)
+{
+    return bit_permute_step_impl(
+               x,
+               cvals_t<u32, 0x55555555, 1, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >>
+           (32 - bits);
+}
+
+// Radix-4 (digit-pair) reversal: same as above but the 1-bit swap step is
+// omitted, so 2-bit digits are reversed instead of individual bits.
+template <size_t bits>
+constexpr inline u32 digitreverse_impl(u32 x, csize_t<4>)
+{
+    return bit_permute_step_impl(
+               x, cvals_t<u32, 0x33333333, 2, 0x0f0f0f0f, 4, 0x00ff00ff, 8, 0x0000ffff, 16>()) >>
+           (32 - bits);
+}
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+// Index-mapping functor for shuffles: maps each lane index to its
+// digit-reversed counterpart (radix 2 or 4, over `bits` digits).
+template <size_t radix, size_t bits>
+struct shuffle_index_digitreverse
+{
+    constexpr inline size_t operator()(size_t index) const CMT_NOEXCEPT
+    {
+        return digitreverse_impl<bits>(static_cast<u32>(index), csize_t<radix>());
+    }
+};
+} // namespace internal
+
+// Reorders vector lanes into digit-reversed order (radix 2 or 4).
+// `group` lanes are moved together as one unit (e.g. group=2 keeps
+// interleaved complex pairs intact).
+template <size_t radix, size_t group = 1, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> digitreverse(const vec<T, N>& x)
+{
+    return x.shuffle(scale<group>(
+        csizeseq<N / group>.map(internal::shuffle_index_digitreverse<radix, ilog2(N / group)>())));
+}
+
+// Bit-reversal permutation of vector lanes (radix-2 digitreverse).
+template <size_t groupsize = 1, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> bitreverse(const vec<T, N>& x)
+{
+    return digitreverse<2, groupsize>(x);
+}
+
+// Radix-4 digit-reversal permutation of vector lanes.
+template <size_t groupsize = 1, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> digitreverse4(const vec<T, N>& x)
+{
+    return digitreverse<4, groupsize>(x);
+}
+
+// Scalar bit reversal of the low `bits` bits of x.
+template <size_t bits>
+constexpr inline u32 bitreverse(u32 x)
+{
+    return internal::digitreverse_impl<bits>(x, csize_t<2>());
+}
+
+// Scalar radix-4 digit reversal of the low `bits` bits of x.
+template <size_t bits>
+constexpr inline u32 digitreverse4(u32 x)
+{
+    return internal::digitreverse_impl<bits>(x, csize_t<4>());
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/horizontal.hpp b/include/kfr/simd/horizontal.hpp
@@ -0,0 +1,138 @@
+/** @addtogroup horizontal
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "operators.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Base case: a 1-element vector reduces to its single element.
+template <typename T, typename ReduceFn>
+KFR_INTRINSIC T horizontal_impl(const vec<T, 1>& value, ReduceFn&&)
+{
+    return T(value.front());
+}
+
+// Power-of-two width: reduce pairwise by combining low and high halves,
+// halving the width each step (log2(N) steps).
+template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))>
+KFR_INTRINSIC T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce)
+{
+    return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce));
+}
+// Non-power-of-two width: widen to the next power of two, padding with the
+// reduction's identity element (reduce(initialvalue<T>())), then recurse.
+template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))>
+KFR_INTRINSIC T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce)
+{
+    const T initial = reduce(initialvalue<T>());
+    return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce));
+}
+} // namespace intrinsics
+
+// Public entry point: reduce all lanes of `value` to a scalar with the given
+// binary reduction functor.
+template <typename T, size_t N, typename ReduceFn>
+KFR_INTRINSIC T horizontal(const vec<T, N>& value, ReduceFn&& reduce)
+{
+    return intrinsics::horizontal_impl(value, std::forward<ReduceFn>(reduce));
+}
+
+/// @brief Sum all elements of the vector
+template <typename T, size_t N>
+KFR_INTRINSIC T hadd(const vec<T, N>& value)
+{
+    return horizontal(value, fn::add());
+}
+KFR_FN(hadd)
+
+/// @brief Sum all elements of the vector
+// Alias of hadd (same reduction with fn::add).
+template <typename T, size_t N>
+KFR_INTRINSIC T hsum(const vec<T, N>& value)
+{
+    return horizontal(value, fn::add());
+}
+KFR_FN(hsum)
+
+/// @brief Multiply all elements of the vector
+template <typename T, size_t N>
+KFR_INTRINSIC T hmul(const vec<T, N>& value)
+{
+    return horizontal(value, fn::mul());
+}
+KFR_FN(hmul)
+
+/// @brief Multiply all elements of the vector
+// Alias of hmul (same reduction with fn::mul).
+template <typename T, size_t N>
+KFR_INTRINSIC T hproduct(const vec<T, N>& value)
+{
+    return horizontal(value, fn::mul());
+}
+KFR_FN(hproduct)
+
+/// @brief Bitwise AND of all elements of the vector
+template <typename T, size_t N>
+KFR_INTRINSIC T hbitwiseand(const vec<T, N>& value)
+{
+    return horizontal(value, fn::bitwiseand());
+}
+KFR_FN(hbitwiseand)
+/// @brief Bitwise OR of all elements of the vector
+template <typename T, size_t N>
+KFR_INTRINSIC T hbitwiseor(const vec<T, N>& value)
+{
+    return horizontal(value, fn::bitwiseor());
+}
+KFR_FN(hbitwiseor)
+/// @brief Bitwise XOR of all elements of the vector
+template <typename T, size_t N>
+KFR_INTRINSIC T hbitwisexor(const vec<T, N>& value)
+{
+    return horizontal(value, fn::bitwisexor());
+}
+KFR_FN(hbitwisexor)
+
+/// @brief Calculate the Dot-Product of two vectors
+// Element-wise product followed by a horizontal sum.
+template <typename T, size_t N>
+KFR_INTRINSIC T hdot(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return hadd(x * y);
+}
+KFR_FN(hdot)
+
+/// @brief Calculate the Arithmetic mean of all elements in the vector
+// Note: integer T uses integer division here.
+template <typename T, size_t N>
+KFR_INTRINSIC T havg(const vec<T, N>& value)
+{
+    return hadd(value) / N;
+}
+KFR_FN(havg)
+
+/// @brief Calculate the RMS of all elements in the vector
+// Root of the mean of squares: sqrt(sum(value^2) / N).
+template <typename T, size_t N>
+KFR_INTRINSIC T hrms(const vec<T, N>& value)
+{
+    return builtin_sqrt(hadd(value * value) / N);
+}
+KFR_FN(hrms)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/impl/backend.hpp b/include/kfr/simd/impl/backend.hpp
@@ -0,0 +1,79 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "simd.hpp"
+#ifdef CMT_CLANG_EXT
+#include "backend_clang.hpp"
+#else
+#include "backend_generic.hpp"
+#endif
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#ifdef KFR_AUTOTESTS
+// Compile-time layout checks for the simd backend storage type: sizeof must
+// equal N elements rounded up to the next power of two (e.g. N=3,5,513 round
+// up to 4, 8, 1024). Only instantiated when KFR_AUTOTESTS is defined.
+template <typename T>
+struct check_sizes
+{
+    static_assert(sizeof(simd<T, 1>) == sizeof(T), "");
+    static_assert(sizeof(simd<T, 2>) == sizeof(T) * 2, "");
+    static_assert(sizeof(simd<T, 3>) == sizeof(T) * 4, "");
+    static_assert(sizeof(simd<T, 4>) == sizeof(T) * 4, "");
+    static_assert(sizeof(simd<T, 5>) == sizeof(T) * 8, "");
+    static_assert(sizeof(simd<T, 6>) == sizeof(T) * 8, "");
+    static_assert(sizeof(simd<T, 7>) == sizeof(T) * 8, "");
+    static_assert(sizeof(simd<T, 8>) == sizeof(T) * 8, "");
+    static_assert(sizeof(simd<T, 16>) == sizeof(T) * 16, "");
+    static_assert(sizeof(simd<T, 32>) == sizeof(T) * 32, "");
+    static_assert(sizeof(simd<T, 64>) == sizeof(T) * 64, "");
+    static_assert(sizeof(simd<T, 128>) == sizeof(T) * 128, "");
+    static_assert(sizeof(simd<T, 256>) == sizeof(T) * 256, "");
+    static_assert(sizeof(simd<T, 512>) == sizeof(T) * 512, "");
+    static_assert(sizeof(simd<T, 513>) == sizeof(T) * 1024, "");
+    static_assert(sizeof(simd<T, 1023>) == sizeof(T) * 1024, "");
+    static_assert(sizeof(simd<T, 1024>) == sizeof(T) * 1024, "");
+};
+
+template struct check_sizes<float>;
+template struct check_sizes<double>;
+template struct check_sizes<uint8_t>;
+template struct check_sizes<uint16_t>;
+template struct check_sizes<uint32_t>;
+template struct check_sizes<uint64_t>;
+template struct check_sizes<int8_t>;
+template struct check_sizes<int16_t>;
+template struct check_sizes<int32_t>;
+template struct check_sizes<int64_t>;
+
+#endif
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+
+using CMT_ARCH_NAME::intrinsics::simd;
+} // namespace kfr
diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp
@@ -0,0 +1,228 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "simd.hpp"
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc99-extensions")
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename TT, size_t NN>
+using simd = TT __attribute__((ext_vector_type(NN)));
+
+template <typename T, size_t N1>
+KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x);
+
+template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)>
+KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y,
+ const simd<T, Ns>&... z);
+
+template <typename Tout>
+KFR_INTRINSIC void simd_make(ctype_t<Tout>) = delete;
+
+template <typename Tout, typename Arg>
+KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg)
+{
+ return (simd<Tout, 1>){ static_cast<Tout>(arg) };
+}
+
+template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)>
+KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args)
+{
+ return (simd<Tout, N>){ static_cast<Tout>(args)... };
+}
+
+/// @brief Returns vector with undefined value
+template <typename Tout, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_undefined()
+{
+ simd<Tout, N> x;
+ return x;
+}
+
+/// @brief Returns vector with all zeros
+template <typename Tout, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_zeros()
+{
+ return Tout();
+}
+
+/// @brief Returns vector with all ones
+template <typename Tout, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_allones()
+{
+ return special_constants<Tout>::allones();
+}
+
+/// @brief Reinterprets (bitcasts) the input vector as a vector with subtype Tout, preserving the bit pattern
+template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout))>
+KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x)
+{
+ return (simd<Tout, Nout>)x;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x)
+{
+ return x;
+}
+
+template <typename T, size_t N, size_t index>
+KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>)
+{
+ return value[index];
+}
+
+template <typename T, size_t N, size_t index>
+KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x)
+{
+ value[index] = x;
+ return value;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value)
+{
+ return value;
+}
+
+template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>,
+ overload_generic)
+{
+ return __builtin_shufflevector(x, x, (indices > N ? -1 : static_cast<int>(indices))...);
+}
+
+template <typename T, size_t N, size_t N2 = N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y,
+ csizes_t<indices...>, overload_generic)
+{
+ static_assert(N == N2, "");
+ return __builtin_shufflevector(x, y, (indices > 2 * N ? -1 : static_cast<int>(indices))...);
+}
+
+template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2),
+ size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y,
+ csizes_t<indices...>, overload_generic)
+{
+ constexpr size_t Nmax = (N1 > N2 ? N1 : N2);
+ return simd_shuffle(
+ simd2_t<T, Nmax, Nmax>{}, simd_shuffle(simd_t<T, N1>{}, x, csizeseq<Nmax>, overload_auto),
+ simd_shuffle(simd_t<T, N2>{}, y, csizeseq<Nmax>, overload_auto),
+ csizes<(indices < N1 ? indices : indices < N1 + N2 ? indices + (Nmax - N1) : index_undefined)...>,
+ overload_auto);
+}
+
+template <typename T, size_t N1>
+KFR_INTRINSIC simd<T, N1> simd_concat(const simd<T, N1>& x)
+{
+ return x;
+}
+
+template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/>
+KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y,
+ const simd<T, Ns>&... z)
+{
+ return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...),
+ csizeseq<N1 + N2 + Nscount>, overload_auto);
+}
+
+/// @brief Converts input vector to vector with subtype Tout
+template <typename Tout, typename Tin, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x)
+{
+ return __builtin_convertvector(x, simd<Tout, N>);
+}
+
+/// @brief Identity overload: when Tout equals Tin, returns the input unchanged
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x)
+{
+ return x;
+}
+
+template <typename T, size_t N, bool A>
+using simd_storage = struct_with_alignment<simd<T, N>, A>;
+
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
+KFR_INTRINSIC simd<T, N> simd_read(const T* src)
+{
+ return ptr_cast<simd_storage<T, N, A>>(src)->value;
+}
+
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
+KFR_INTRINSIC simd<T, N> simd_read(const T* src)
+{
+ constexpr size_t first = prev_poweroftwo(N);
+ constexpr size_t rest = N - first;
+ constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>);
+ constexpr auto concat_indices = cvalseq_t<size_t, N>();
+ return simd_shuffle(
+ simd2_t<T, first, first>{}, simd_read<first, A>(src),
+ simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto),
+ concat_indices, overload_auto);
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
+KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value)
+{
+ ptr_cast<simd_storage<T, N, A>>(dest)->value = value;
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
+KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value)
+{
+ constexpr size_t first = prev_poweroftwo(N);
+ constexpr size_t rest = N - first;
+ simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq<first>, overload_auto));
+ simd_write<false, rest>(dest + first,
+ simd_shuffle(simd_t<T, N>{}, value, csizeseq<rest, first>, overload_auto));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index)
+{
+ return value[index];
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, size_t index, T x)
+{
+ value[index] = x;
+ return value;
+}
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp
@@ -0,0 +1,1080 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "simd.hpp"
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wuninitialized")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunknown-warning-option")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wmaybe-uninitialized")
+
+namespace kfr
+{
+
+#if KFR_SHOW_NOT_OPTIMIZED
+CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT;
+#else
+#define not_optimized(...) \
+ do \
+ { \
+ } while (0)
+#endif
+
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T, size_t N>
+using simd = typename simd_type<T, N>::type;
+
+template <typename T, size_t N, typename U>
+union simd_small_array {
+ static_assert(sizeof(T) * N == sizeof(U), "");
+ T arr[N];
+ U whole;
+
+ KFR_INTRINSIC static constexpr simd_small_array from(U whole)
+ {
+ union {
+ const U w;
+ simd_small_array r;
+ } u{ whole };
+ return u.r;
+ }
+};
+
+#define KFR_SIMD_TYPE(T, N, ...) \
+ template <> \
+ struct simd_type<T, N> \
+ { \
+ using type = __VA_ARGS__; \
+ };
+
+#define KFR_SIMD_SMALL_TYPE(T, N, U) \
+ template <> \
+ struct simd_type<T, N> \
+ { \
+ using type = simd_small_array<T, N, U>; \
+ };
+
+template <typename T>
+struct simd_type<T, 1>
+{
+ using type = T;
+};
+
+template <typename T, size_t N>
+struct simd_type
+{
+ using type = simd_halves<T, N>;
+};
+
+KFR_SIMD_SMALL_TYPE(u8, 2, u16)
+KFR_SIMD_SMALL_TYPE(i8, 2, u16)
+
+KFR_SIMD_SMALL_TYPE(u8, 4, u32)
+KFR_SIMD_SMALL_TYPE(u16, 2, u32)
+KFR_SIMD_SMALL_TYPE(i8, 4, u32)
+KFR_SIMD_SMALL_TYPE(i16, 2, u32)
+
+KFR_SIMD_SMALL_TYPE(u8, 8, u64)
+KFR_SIMD_SMALL_TYPE(u16, 4, u64)
+KFR_SIMD_SMALL_TYPE(u32, 2, u64)
+KFR_SIMD_SMALL_TYPE(i8, 8, u64)
+KFR_SIMD_SMALL_TYPE(i16, 4, u64)
+KFR_SIMD_SMALL_TYPE(i32, 2, u64)
+
+KFR_SIMD_SMALL_TYPE(f32, 2, f64)
+
+#ifdef CMT_ARCH_SSE
+KFR_SIMD_TYPE(f32, 4, __m128)
+KFR_SIMD_TYPE(f64, 2, __m128d)
+#endif // CMT_ARCH_SSE
+
+#ifdef CMT_ARCH_SSE2
+KFR_SIMD_TYPE(u8, 16, __m128i)
+KFR_SIMD_TYPE(u16, 8, __m128i)
+KFR_SIMD_TYPE(u32, 4, __m128i)
+KFR_SIMD_TYPE(u64, 2, __m128i)
+KFR_SIMD_TYPE(i8, 16, __m128i)
+KFR_SIMD_TYPE(i16, 8, __m128i)
+KFR_SIMD_TYPE(i32, 4, __m128i)
+KFR_SIMD_TYPE(i64, 2, __m128i)
+#endif // CMT_ARCH_SSE2
+
+#ifdef CMT_ARCH_AVX
+KFR_SIMD_TYPE(float, 8, __m256)
+KFR_SIMD_TYPE(double, 4, __m256d)
+KFR_SIMD_TYPE(u8, 32, __m256i)
+KFR_SIMD_TYPE(u16, 16, __m256i)
+KFR_SIMD_TYPE(u32, 8, __m256i)
+KFR_SIMD_TYPE(u64, 4, __m256i)
+KFR_SIMD_TYPE(i8, 32, __m256i)
+KFR_SIMD_TYPE(i16, 16, __m256i)
+KFR_SIMD_TYPE(i32, 8, __m256i)
+KFR_SIMD_TYPE(i64, 4, __m256i)
+#endif // CMT_ARCH_AVX
+
+#ifdef CMT_ARCH_AVX512
+KFR_SIMD_TYPE(float, 16, __m512)
+KFR_SIMD_TYPE(double, 8, __m512d)
+KFR_SIMD_TYPE(u8, 64, __m512i)
+KFR_SIMD_TYPE(u16, 32, __m512i)
+KFR_SIMD_TYPE(u32, 16, __m512i)
+KFR_SIMD_TYPE(u64, 8, __m512i)
+KFR_SIMD_TYPE(i8, 64, __m512i)
+KFR_SIMD_TYPE(i16, 32, __m512i)
+KFR_SIMD_TYPE(i32, 16, __m512i)
+KFR_SIMD_TYPE(i64, 8, __m512i)
+#endif // CMT_ARCH_AVX512
+
+#ifdef CMT_ARCH_NEON
+KFR_SIMD_TYPE(u8, 16, uint8x16_t);
+KFR_SIMD_TYPE(u16, 8, uint16x8_t);
+KFR_SIMD_TYPE(u32, 4, uint32x4_t);
+KFR_SIMD_TYPE(u64, 2, uint64x2_t);
+KFR_SIMD_TYPE(i8, 16, int8x16_t);
+KFR_SIMD_TYPE(i16, 8, int16x8_t);
+KFR_SIMD_TYPE(i32, 4, int32x4_t);
+KFR_SIMD_TYPE(i64, 2, int64x2_t);
+KFR_SIMD_TYPE(f32, 4, float32x4_t);
+#ifdef CMT_ARCH_NEON64
+KFR_SIMD_TYPE(f64, 2, float64x2_t);
+#endif // CMT_ARCH_NEON64
+#endif // CMT_ARCH_NEON
+
+#if defined CMT_COMPILER_MSVC
+#define KFR_i8sse_INDEX(x, i) x.m128i_i8[i]
+#define KFR_i16sse_INDEX(x, i) x.m128i_i16[i]
+#define KFR_i32sse_INDEX(x, i) x.m128i_i32[i]
+#define KFR_i64sse_INDEX(x, i) x.m128i_i64[i]
+#define KFR_u8sse_INDEX(x, i) x.m128i_u8[i]
+#define KFR_u16sse_INDEX(x, i) x.m128i_u16[i]
+#define KFR_u32sse_INDEX(x, i) x.m128i_u32[i]
+#define KFR_u64sse_INDEX(x, i) x.m128i_u64[i]
+#define KFR_f32sse_INDEX(x, i) x.m128_f32[i]
+#define KFR_f64sse_INDEX(x, i) x.m128d_f64[i]
+#else
+#define KFR_i8sse_INDEX(x, i) bitcast_anything<simd_array<i8, 16>>(x).val[i]
+#define KFR_i16sse_INDEX(x, i) bitcast_anything<simd_array<i16, 8>>(x).val[i]
+#define KFR_i32sse_INDEX(x, i) _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 1, i)))
+#define KFR_i64sse_INDEX(x, i) _mm_cvtsi128_si64(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, (i)*2 + 1, i * 2)))
+#define KFR_u8sse_INDEX(x, i) bitcast_anything<simd_array<u8, 16>>(x).val[i]
+#define KFR_u16sse_INDEX(x, i) bitcast_anything<simd_array<u16, 8>>(x).val[i]
+#define KFR_u32sse_INDEX(x, i) _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 1, i)))
+#define KFR_u64sse_INDEX(x, i) _mm_cvtsi128_si64(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, (i)*2 + 1, i * 2)))
+#define KFR_f32sse_INDEX(x, i) _mm_cvtss_f32(_mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 2, 1, i)))
+#define KFR_f64sse_INDEX(x, i) _mm_cvtsd_f64(_mm_shuffle_pd(x, x, _MM_SHUFFLE2(1, i)))
+#endif
+
+// specializations
+
+#ifdef KFR_NATIVE_INTRINSICS
+
+#define KFR_GEN_ty(n, ty) ty(n)
+#define KFR_GEN_arg_def(n, ty) ty arg##n
+#define KFR_GEN_arg(n, ty) arg##n
+
+#define KFR_INTRIN_MAKE(n, ty, intrin) \
+ KFR_INTRINSIC simd<ty, n> simd_make(ctype_t<ty>, CMT_GEN_LIST(n, KFR_GEN_arg_def, ty)) CMT_NOEXCEPT \
+ { \
+ return intrin(CMT_GEN_LIST(n, KFR_GEN_arg, ty)); \
+ }
+
+#ifdef CMT_ARCH_SSE2
+inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); }
+KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x)
+KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x)
+KFR_INTRIN_MAKE(2, f64, _mm_setr_pd)
+KFR_INTRIN_MAKE(4, i32, _mm_setr_epi32)
+KFR_INTRIN_MAKE(4, u32, _mm_setr_epi32)
+KFR_INTRIN_MAKE(4, f32, _mm_setr_ps)
+KFR_INTRIN_MAKE(8, i16, _mm_setr_epi16)
+KFR_INTRIN_MAKE(8, u16, _mm_setr_epi16)
+KFR_INTRIN_MAKE(16, i8, _mm_setr_epi8)
+KFR_INTRIN_MAKE(16, u8, _mm_setr_epi8)
+
+#define KFR_INTRIN_BITCAST(Tout, Tin, N, ...) \
+ KFR_INTRINSIC simd<Tout, N> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \
+ { \
+ return __VA_ARGS__; \
+ }
+KFR_INTRIN_BITCAST(f32, i32, 4, _mm_castsi128_ps(x))
+KFR_INTRIN_BITCAST(i32, f32, 4, _mm_castps_si128(x))
+KFR_INTRIN_BITCAST(f64, i64, 2, _mm_castsi128_pd(x))
+KFR_INTRIN_BITCAST(i64, f64, 2, _mm_castpd_si128(x))
+
+#define KFR_INTRIN_BROADCAST(T, N, ...) \
+ KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, T value) CMT_NOEXCEPT { return __VA_ARGS__; }
+
+KFR_INTRIN_BROADCAST(i8, 16, _mm_set1_epi8(value))
+KFR_INTRIN_BROADCAST(i16, 8, _mm_set1_epi16(value))
+KFR_INTRIN_BROADCAST(i32, 4, _mm_set1_epi32(value))
+KFR_INTRIN_BROADCAST(i64, 2, _mm_set1_epi64x(value))
+KFR_INTRIN_BROADCAST(u8, 16, _mm_set1_epi8(value))
+KFR_INTRIN_BROADCAST(u16, 8, _mm_set1_epi16(value))
+KFR_INTRIN_BROADCAST(u32, 4, _mm_set1_epi32(value))
+KFR_INTRIN_BROADCAST(u64, 2, _mm_set1_epi64x(value))
+KFR_INTRIN_BROADCAST(f32, 4, _mm_set1_ps(value))
+KFR_INTRIN_BROADCAST(f64, 2, _mm_set1_pd(value))
+
+#define KFR_INTRIN_SHUFFLE_SWAP(T, N, ...) \
+ KFR_INTRINSIC simd<T, N> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N> ^ csize<1>, \
+ overload_priority<9>) CMT_NOEXCEPT \
+ { \
+ return __VA_ARGS__; \
+ }
+
+#define KFR_INTRIN_SHUFFLE_LINEAR(T, Nout, Nin, ...) \
+ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizeseq_t<Nout>, \
+ overload_priority<9>) CMT_NOEXCEPT \
+ { \
+ return __VA_ARGS__; \
+ }
+#define KFR_INTRIN_SHUFFLE_LINEAR_START(T, Nout, Nin, Nstart, ...) \
+ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, \
+ csizeseq_t<Nout, Nstart>, overload_priority<9>) CMT_NOEXCEPT \
+ { \
+ return __VA_ARGS__; \
+ }
+
+#define KFR_INTRIN_SHUFFLE_CONCAT(T, Nin, ...) \
+ KFR_INTRINSIC simd<T, Nin + Nin> simd_shuffle(simd2_t<T, Nin, Nin>, const simd<T, Nin>& x, \
+ const simd<T, Nin>& y, csizeseq_t<Nin + Nin>, \
+ overload_priority<9>) CMT_NOEXCEPT \
+ { \
+ return __VA_ARGS__; \
+ }
+
+// extend
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 1, _mm_cvtsi32_si128(u8(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 1, _mm_cvtsi32_si128(u16(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 1, _mm_cvtsi32_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 1, _mm_cvtsi64_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 1, _mm_cvtsi32_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 1, _mm_cvtsi32_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 1, _mm_cvtsi32_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 1, _mm_cvtsi64_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 1, _mm_set_ss(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 1, _mm_set_sd(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 2, _mm_cvtsi32_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 2, _mm_cvtsi32_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 4, _mm_cvtsi32_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 4, _mm_cvtsi32_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 8, _mm_cvtsi64_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 8, _mm_cvtsi64_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 2, _mm_cvtsi32_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 2, _mm_cvtsi32_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 4, _mm_cvtsi64_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 4, _mm_cvtsi64_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 2, _mm_cvtsi64_si128(x.whole))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 2, _mm_cvtsi64_si128(x.whole))
+
+// slice
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 1, 4, _mm_cvtsi128_si32(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 1, 4, _mm_cvtsi128_si32(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x))))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x))
+
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x))))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 4, 16, simd<i8, 4>::from(_mm_cvtsi128_si32(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 8, 16, simd<i8, 8>::from(_mm_cvtsi128_si64(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 2, 16, simd<u8, 2>::from(u16(_mm_cvtsi128_si32(x))))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 4, 16, simd<u8, 4>::from(_mm_cvtsi128_si32(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 8, 16, simd<u8, 8>::from(_mm_cvtsi128_si64(x)))
+
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 2, 8, simd<i16, 2>::from(_mm_cvtsi128_si32(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 4, 8, simd<i16, 4>::from(_mm_cvtsi128_si64(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 2, 8, simd<u16, 2>::from(_mm_cvtsi128_si32(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 4, 8, simd<u16, 4>::from(_mm_cvtsi128_si64(x)))
+
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 2, 4, simd<i32, 2>::from(_mm_cvtsi128_si64(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 2, 4, simd<u32, 2>::from(_mm_cvtsi128_si64(x)))
+
+// high
+KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 8, 16, 8, simd<u8, 8>::from(KFR_u64sse_INDEX(x, 1)))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 8, 16, 8, simd<i8, 8>::from(KFR_u64sse_INDEX(x, 1)))
+KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 4, 8, 4, simd<u16, 4>::from(KFR_u64sse_INDEX(x, 1)))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDEX(x, 1)))
+KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1)))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1)))
+
+#define KFR_INTRIN_CONVERT(Tout, Tin, N, ...) \
+ KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \
+ { \
+ return __VA_ARGS__; \
+ }
+
+KFR_INTRIN_CONVERT(f32, i32, 4, _mm_cvtepi32_ps(x))
+KFR_INTRIN_CONVERT(i32, f32, 4, _mm_cvttps_epi32(x))
+KFR_INTRIN_CONVERT(i32, f64, 2, simd<i32, 2>::from(_mm_cvtsi128_si64(_mm_cvttpd_epi32(x))))
+KFR_INTRIN_CONVERT(f64, i32, 2, _mm_cvtepi32_pd(KFR_mm_setr_epi64x(x.whole, 0)))
+KFR_INTRIN_CONVERT(i64, f64, 2, _mm_set_epi64x(_mm_cvttsd_si64(_mm_unpackhi_pd(x, x)), _mm_cvttsd_si64(x)))
+KFR_INTRIN_CONVERT(f64, i64, 2,
+ _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), _mm_cvtsi128_si64(x)),
+ _mm_cvtsi64_sd(_mm_setzero_pd(), KFR_i64sse_INDEX(x, 1))))
+#ifdef CMT_ARCH_AVX
+KFR_INTRIN_CONVERT(f64, f32, 4, _mm256_cvtps_pd(x))
+#else
+KFR_INTRIN_CONVERT(f64, f32, 4,
+ simd<f64, 4>{ _mm_cvtps_pd(x),
+ _mm_cvtps_pd(_mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2))) })
+#endif
+#ifdef CMT_ARCH_AVX
+KFR_INTRIN_CONVERT(f32, f64, 4, _mm256_cvtpd_ps(x))
+#else
+KFR_INTRIN_CONVERT(f32, f64, 4,
+ simd<f32, 4>{ _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtpd_ps(x.low)),
+ _mm_castps_pd(_mm_cvtpd_ps(x.high)))) })
+#endif
+#endif // CMT_ARCH_SSE2
+
+#ifdef CMT_ARCH_SSE41
+
+KFR_INTRIN_CONVERT(i16, i8, 8, _mm_cvtepi8_epi16(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(u16, u8, 8, _mm_cvtepu8_epi16(_mm_cvtsi64_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(i32, i16, 4, _mm_cvtepi16_epi32(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(u32, u16, 4, _mm_cvtepu16_epi32(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(i32, i8, 4, _mm_cvtepi8_epi32(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_CONVERT(u32, u8, 4, _mm_cvtepu8_epi32(_mm_cvtsi32_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(i64, i32, 2, _mm_cvtepi32_epi64(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(u64, u32, 2, _mm_cvtepu32_epi64(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(i64, i16, 2, _mm_cvtepi16_epi64(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_CONVERT(u64, u16, 2, _mm_cvtepu16_epi64(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_CONVERT(i64, i8, 2, _mm_cvtepi8_epi64(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_CONVERT(u64, u8, 2, _mm_cvtepu8_epi64(_mm_cvtsi32_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(f32, i8, 4, _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_cvtsi32_si128(x.whole))))
+KFR_INTRIN_CONVERT(f32, i16, 4, _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_cvtsi64_si128(x.whole))))
+KFR_INTRIN_CONVERT(f32, u8, 4, _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(x.whole))))
+KFR_INTRIN_CONVERT(f32, u16, 4, _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_cvtsi64_si128(x.whole))))
+
+#ifndef CMT_ARCH_AVX
+KFR_INTRIN_CONVERT(i64, i32, 4,
+ simd<i64, 4>{ _mm_cvtepi32_epi64(x),
+ _mm_cvtepi32_epi64(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))) })
+#endif
+#endif
+
+#ifdef CMT_ARCH_AVX
+KFR_INTRIN_MAKE(4, f64, _mm256_setr_pd)
+KFR_INTRIN_MAKE(8, f32, _mm256_setr_ps)
+
+KFR_INTRIN_BITCAST(f32, i32, 8, _mm256_castsi256_ps(x))
+
+KFR_INTRIN_BITCAST(i32, f32, 8, _mm256_castps_si256(x))
+KFR_INTRIN_BITCAST(f64, i64, 4, _mm256_castsi256_pd(x))
+KFR_INTRIN_BITCAST(i64, f64, 4, _mm256_castpd_si256(x))
+
+KFR_INTRINSIC __m256 KFR_mm256_setr_m128(__m128 x, __m128 y)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(x), y, 1);
+}
+
+KFR_INTRINSIC __m256d KFR_mm256_setr_m128d(__m128d x, __m128d y)
+{
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(x), y, 1);
+}
+KFR_INTRINSIC __m256i KFR_mm256_setr_m128i(__m128i x, __m128i y)
+{
+#ifdef CMT_ARCH_AVX2
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(x), y, 1);
+#else
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(x)), _mm_castsi128_ps(y), 1));
+#endif
+}
+
+KFR_INTRIN_SHUFFLE_CONCAT(f32, 4, KFR_mm256_setr_m128(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(f64, 2, KFR_mm256_setr_m128d(x, y))
+
+// concat
+KFR_INTRIN_SHUFFLE_CONCAT(i8, 16, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(i16, 8, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(i32, 4, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(i64, 2, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u8, 16, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u16, 8, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u32, 4, KFR_mm256_setr_m128i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u64, 2, KFR_mm256_setr_m128i(x, y))
+// low
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 8, _mm256_castps256_ps128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 2, 4, _mm256_castpd256_pd128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16, 32, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8, 16, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4, 8, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2, 4, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16, 32, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 16, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 8, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 4, _mm256_castsi256_si128(x))
+
+// extend
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 4, _mm256_castps128_ps256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 2, _mm256_castpd128_pd256(x))
+
+// high
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4, 8, 4, _mm256_extractf128_ps(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2, 4, 2, _mm256_extractf128_pd(x, 1))
+
+KFR_INTRIN_BROADCAST(f32, 8, _mm256_set1_ps(value))
+KFR_INTRIN_BROADCAST(f64, 4, _mm256_set1_pd(value))
+
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 8, 1, _mm256_castps128_ps256(_mm_set_ss(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 4, 1, _mm256_castpd128_pd256(_mm_set_sd(x)))
+#endif // CMT_ARCH_AVX
+
+#ifdef CMT_ARCH_AVX2
+KFR_INTRIN_MAKE(4, i64, _mm256_setr_epi64x)
+KFR_INTRIN_MAKE(4, u64, _mm256_setr_epi64x)
+KFR_INTRIN_MAKE(8, i32, _mm256_setr_epi32)
+KFR_INTRIN_MAKE(8, u32, _mm256_setr_epi32)
+KFR_INTRIN_MAKE(16, i16, _mm256_setr_epi16)
+KFR_INTRIN_MAKE(16, u16, _mm256_setr_epi16)
+KFR_INTRIN_MAKE(32, i8, _mm256_setr_epi8)
+KFR_INTRIN_MAKE(32, u8, _mm256_setr_epi8)
+
+KFR_INTRIN_CONVERT(i16, i8, 16, _mm256_cvtepi8_epi16(x))
+KFR_INTRIN_CONVERT(u16, u8, 16, _mm256_cvtepu8_epi16(x))
+
+KFR_INTRIN_CONVERT(i32, i16, 8, _mm256_cvtepi16_epi32(x))
+KFR_INTRIN_CONVERT(u32, u16, 8, _mm256_cvtepu16_epi32(x))
+KFR_INTRIN_CONVERT(i32, i8, 8, _mm256_cvtepi8_epi32(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(u32, u8, 8, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(i64, i32, 4, _mm256_cvtepi32_epi64(x))
+KFR_INTRIN_CONVERT(u64, u32, 4, _mm256_cvtepu32_epi64(x))
+KFR_INTRIN_CONVERT(i64, i16, 4, _mm256_cvtepi16_epi64(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(u64, u16, 4, _mm256_cvtepu16_epi64(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_CONVERT(i64, i8, 4, _mm256_cvtepi8_epi64(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_CONVERT(u64, u8, 4, _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(f32, i8, 8, _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_cvtsi64_si128(x.whole))))
+KFR_INTRIN_CONVERT(f32, i16, 8, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(x)))
+KFR_INTRIN_CONVERT(f32, u8, 8, _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(x.whole))))
+KFR_INTRIN_CONVERT(f32, u16, 8, _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(x)))
+
+KFR_INTRIN_SHUFFLE_LINEAR_START(i8, 16, 32, 16, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 8, 16, 8, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4, 8, 4, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2, 4, 2, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(u8, 16, 32, 16, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(u16, 8, 16, 8, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 4, 8, 4, _mm256_extracti128_si256(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(u64, 2, 4, 2, _mm256_extracti128_si256(x, 1))
+
+KFR_INTRIN_BROADCAST(i8, 32, _mm256_set1_epi8(value))
+KFR_INTRIN_BROADCAST(i16, 16, _mm256_set1_epi16(value))
+KFR_INTRIN_BROADCAST(i32, 8, _mm256_set1_epi32(value))
+KFR_INTRIN_BROADCAST(i64, 4, _mm256_set1_epi64x(value))
+KFR_INTRIN_BROADCAST(u8, 32, _mm256_set1_epi8(value))
+KFR_INTRIN_BROADCAST(u16, 16, _mm256_set1_epi16(value))
+KFR_INTRIN_BROADCAST(u32, 8, _mm256_set1_epi32(value))
+KFR_INTRIN_BROADCAST(u64, 4, _mm256_set1_epi64x(value))
+
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 16, _mm256_castsi128_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 8, _mm256_castsi128_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 4, _mm256_castsi128_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 2, _mm256_castsi128_si256(x))
+
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u8(x))))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(u16(x))))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 1, _mm256_castsi128_si256(_mm_cvtsi64_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 8, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 4, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 2, _mm256_castsi128_si256(_mm_cvtsi64_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(i32, f32, 8, _mm256_cvttps_epi32(x))
+KFR_INTRIN_CONVERT(f32, i32, 8, _mm256_cvtepi32_ps(x))
+KFR_INTRIN_CONVERT(f64, i32, 4, _mm256_cvtepi32_pd(x))
+KFR_INTRIN_CONVERT(i32, f64, 4, _mm256_cvttpd_epi32(x))
+#endif // CMT_ARCH_AVX2
+
+#ifdef CMT_ARCH_AVX512
+
+static inline __m512d KFR_mm512_setr_pd(f64 x0, f64 x1, f64 x2, f64 x3, f64 x4, f64 x5, f64 x6, f64 x7)
+{ // "setr" = set in memory (ascending-lane) order; AVX512 lacks _mm512_setr_*, so wrap _mm512_set_* with reversed args
+    return _mm512_set_pd(x7, x6, x5, x4, x3, x2, x1, x0);
+}
+static inline __m512 KFR_mm512_setr_ps(f32 x0, f32 x1, f32 x2, f32 x3, f32 x4, f32 x5, f32 x6, f32 x7, f32 x8,
+                                       f32 x9, f32 x10, f32 x11, f32 x12, f32 x13, f32 x14, f32 x15)
+{ // 16 x f32 in lane order
+    return _mm512_set_ps(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
+}
+static inline __m512i KFR_mm512_setr_epi64(i64 x0, i64 x1, i64 x2, i64 x3, i64 x4, i64 x5, i64 x6, i64 x7)
+{ // 8 x i64 in lane order
+    return _mm512_set_epi64(x7, x6, x5, x4, x3, x2, x1, x0);
+}
+static inline __m512i KFR_mm512_setr_epi32(i32 x0, i32 x1, i32 x2, i32 x3, i32 x4, i32 x5, i32 x6, i32 x7,
+                                           i32 x8, i32 x9, i32 x10, i32 x11, i32 x12, i32 x13, i32 x14,
+                                           i32 x15)
+{ // 16 x i32 in lane order
+    return _mm512_set_epi32(x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
+}
+static inline __m512i KFR_mm512_setr_epi16(i16 x0, i16 x1, i16 x2, i16 x3, i16 x4, i16 x5, i16 x6, i16 x7,
+                                           i16 x8, i16 x9, i16 x10, i16 x11, i16 x12, i16 x13, i16 x14,
+                                           i16 x15, i16 x16, i16 x17, i16 x18, i16 x19, i16 x20, i16 x21,
+                                           i16 x22, i16 x23, i16 x24, i16 x25, i16 x26, i16 x27, i16 x28,
+                                           i16 x29, i16 x30, i16 x31)
+{ // 32 x i16 in lane order
+    return _mm512_set_epi16(x31, x30, x29, x28, x27, x26, x25, x24, x23, x22, x21, x20, x19, x18, x17, x16,
+                            x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
+}
+static inline __m512i KFR_mm512_setr_epi8(i8 x0, i8 x1, i8 x2, i8 x3, i8 x4, i8 x5, i8 x6, i8 x7, i8 x8,
+                                          i8 x9, i8 x10, i8 x11, i8 x12, i8 x13, i8 x14, i8 x15, i8 x16,
+                                          i8 x17, i8 x18, i8 x19, i8 x20, i8 x21, i8 x22, i8 x23, i8 x24,
+                                          i8 x25, i8 x26, i8 x27, i8 x28, i8 x29, i8 x30, i8 x31, i8 x32,
+                                          i8 x33, i8 x34, i8 x35, i8 x36, i8 x37, i8 x38, i8 x39, i8 x40,
+                                          i8 x41, i8 x42, i8 x43, i8 x44, i8 x45, i8 x46, i8 x47, i8 x48,
+                                          i8 x49, i8 x50, i8 x51, i8 x52, i8 x53, i8 x54, i8 x55, i8 x56,
+                                          i8 x57, i8 x58, i8 x59, i8 x60, i8 x61, i8 x62, i8 x63)
+{ // 64 x i8 in lane order
+    return _mm512_set_epi8(x63, x62, x61, x60, x59, x58, x57, x56, x55, x54, x53, x52, x51, x50, x49, x48,
+                           x47, x46, x45, x44, x43, x42, x41, x40, x39, x38, x37, x36, x35, x34, x33, x32,
+                           x31, x30, x29, x28, x27, x26, x25, x24, x23, x22, x21, x20, x19, x18, x17, x16,
+                           x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0);
+}
+
+KFR_INTRINSIC __m512 KFR_mm512_setr_m256(__m256 x, __m256 y)
+{ // concatenate two 256-bit halves: x -> lanes 0..7, y -> lanes 8..15
+    return _mm512_insertf32x8(_mm512_castps256_ps512(x), y, 1);
+}
+
+KFR_INTRINSIC __m512d KFR_mm512_setr_m256d(__m256d x, __m256d y)
+{ // x -> lanes 0..3, y -> lanes 4..7
+    return _mm512_insertf64x4(_mm512_castpd256_pd512(x), y, 1);
+}
+KFR_INTRINSIC __m512i KFR_mm512_setr_m256i(__m256i x, __m256i y)
+{ // x -> low 256 bits, y -> high 256 bits
+    return _mm512_inserti32x8(_mm512_castsi256_si512(x), y, 1);
+}
+
+KFR_INTRIN_MAKE(8, f64, KFR_mm512_setr_pd) // build simd<T, N> from N scalar arguments
+KFR_INTRIN_MAKE(16, f32, KFR_mm512_setr_ps)
+
+KFR_INTRIN_MAKE(8, i64, KFR_mm512_setr_epi64) // signed/unsigned pairs share the same integer intrinsic
+KFR_INTRIN_MAKE(8, u64, KFR_mm512_setr_epi64)
+KFR_INTRIN_MAKE(16, i32, KFR_mm512_setr_epi32)
+KFR_INTRIN_MAKE(16, u32, KFR_mm512_setr_epi32)
+KFR_INTRIN_MAKE(32, i16, KFR_mm512_setr_epi16)
+KFR_INTRIN_MAKE(32, u16, KFR_mm512_setr_epi16)
+KFR_INTRIN_MAKE(64, i8, KFR_mm512_setr_epi8)
+KFR_INTRIN_MAKE(64, u8, KFR_mm512_setr_epi8)
+
+KFR_INTRIN_BROADCAST(f32, 16, _mm512_set1_ps(value)) // replicate one scalar into all lanes
+KFR_INTRIN_BROADCAST(f64, 8, _mm512_set1_pd(value))
+
+KFR_INTRIN_BROADCAST(i8, 64, _mm512_set1_epi8(value))
+KFR_INTRIN_BROADCAST(i16, 32, _mm512_set1_epi16(value))
+KFR_INTRIN_BROADCAST(i32, 16, _mm512_set1_epi32(value))
+KFR_INTRIN_BROADCAST(i64, 8, _mm512_set1_epi64(value))
+KFR_INTRIN_BROADCAST(u8, 64, _mm512_set1_epi8(value))
+KFR_INTRIN_BROADCAST(u16, 32, _mm512_set1_epi16(value))
+KFR_INTRIN_BROADCAST(u32, 16, _mm512_set1_epi32(value))
+KFR_INTRIN_BROADCAST(u64, 8, _mm512_set1_epi64(value))
+
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u8(x)))) // u8(x) zero-extends; plain i8 would sign-extend into higher bits
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(u16(x))))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x))) // unsigned types already zero-extend on int conversion
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi32_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 4, 1, _mm512_castsi128_si512(_mm_cvtsi64_si128(x)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole))) // small vectors packed into .whole; upper zmm bits undefined after cast
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 8, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi32_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 4, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2, _mm512_castsi128_si512(_mm_cvtsi64_si128(x.whole)))
+
+KFR_INTRIN_CONVERT(i32, f32, 16, _mm512_cvttps_epi32(x)) // f32x16 -> i32x16, truncation toward zero
+KFR_INTRIN_CONVERT(f32, i32, 16, _mm512_cvtepi32_ps(x)) // i32x16 -> f32x16
+KFR_INTRIN_CONVERT(f64, i32, 8, _mm512_cvtepi32_pd(x)) // i32x8 -> f64x8 (exact, widening)
+KFR_INTRIN_CONVERT(i32, f64, 8, _mm512_cvttpd_epi32(x)) // f64x8 -> i32x8, truncation toward zero
+
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 4, _mm512_castps128_ps512(x)) // widen xmm -> zmm (upper lanes undefined)
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2, _mm512_castpd128_pd512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 16, _mm512_castsi128_si512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 8, _mm512_castsi128_si512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 4, _mm512_castsi128_si512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2, _mm512_castsi128_si512(x))
+
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 4, 2 * 4, _mm512_castps256_ps512(x)) // widen ymm -> zmm (upper lanes undefined)
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 4, 2 * 2, _mm512_castpd256_pd512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 4, 2 * 16, _mm512_castsi256_si512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 4, 2 * 8, _mm512_castsi256_si512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 4, 2 * 4, _mm512_castsi256_si512(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 4, 2 * 2, _mm512_castsi256_si512(x))
+
+// low half: extract lower 256 bits of a zmm register (no instruction generated)
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 8 * 2, _mm512_castps512_ps256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 4 * 2, _mm512_castpd512_pd256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i16, 8 * 2, 16 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(i64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u8, 16 * 2, 32 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u16, 8 * 2, 16 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u32, 4 * 2, 8 * 2, _mm512_castsi512_si256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(u64, 2 * 2, 4 * 2, _mm512_castsi512_si256(x))
+
+// high half: extract upper 256 bits (NOTE(review): only f32/f64/i32/i64 variants provided here; other types presumably fall back to the generic path)
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4 * 2, 8 * 2, 4 * 2, _mm512_extractf32x8_ps(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(f64, 2 * 2, 4 * 2, 2 * 2, _mm512_extractf64x4_pd(x, 1))
+
+KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 4 * 2, 8 * 2, 4 * 2, _mm512_extracti32x8_epi32(x, 1))
+KFR_INTRIN_SHUFFLE_LINEAR_START(i64, 2 * 2, 4 * 2, 2 * 2, _mm512_extracti64x4_epi64(x, 1))
+
+// concat: join two 256-bit vectors into one 512-bit vector (x = low, y = high)
+KFR_INTRIN_SHUFFLE_CONCAT(f32, 4 * 2, KFR_mm512_setr_m256(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(f64, 2 * 2, KFR_mm512_setr_m256d(x, y))
+
+KFR_INTRIN_SHUFFLE_CONCAT(i8, 16 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(i16, 8 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(i32, 4 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(i64, 2 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u8, 16 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u16, 8 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u32, 4 * 2, KFR_mm512_setr_m256i(x, y))
+KFR_INTRIN_SHUFFLE_CONCAT(u64, 2 * 2, KFR_mm512_setr_m256i(x, y))
+#endif // CMT_ARCH_AVX512
+
+#endif // KFR_NATIVE_INTRINSICS (presumably; opening directive is outside this view)
+
+// generic fallback implementations: arch-independent, built on simd_array round-trips
+
+template <typename T, size_t N1>
+KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT; // forward decl: single-argument base case
+
+template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount = csum(csizes<Ns...>)>
+KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y,
+                                                     const simd<T, Ns>&... z) CMT_NOEXCEPT; // forward decl: variadic concat used by simd_make/simd_convert below
+
+template <typename T, size_t N>
+KFR_INTRINSIC simd_array<T, N> to_simd_array(const simd<T, N>& x) CMT_NOEXCEPT
+{ // reinterpret native vector register as a plain array of T (bitwise copy)
+    return bitcast_anything<simd_array<T, N>>(x);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
+{ // inverse of to_simd_array: plain array back to native vector type
+    return bitcast_anything<simd<T, N>>(x);
+}
+
+#define KFR_COMPONENTWISE_RET(code)                                                                        \
+    vec<T, N> result;                                                                                      \
+    for (size_t i = 0; i < N; i++)                                                                         \
+        code;                                                                                              \
+    return result; // KFR_COMPONENTWISE_RET: run 'code' per lane into 'result' (expects T, N in scope)
+
+#define KFR_COMPONENTWISE_RET_I(Tvec, code)                                                                \
+    Tvec result;                                                                                           \
+    for (size_t i = 0; i < result.size(); i++)                                                             \
+        code;                                                                                              \
+    return result; // KFR_COMPONENTWISE_RET_I: same, but result type given explicitly
+
+#define KFR_COMPONENTWISE(code)                                                                            \
+    for (size_t i = 0; i < N; i++)                                                                         \
+        code; // KFR_COMPONENTWISE: per-lane statement without a return
+
+template <typename Tout>
+KFR_INTRINSIC void simd_make(ctype_t<Tout>) CMT_NOEXCEPT = delete; // constructing a zero-length vector is ill-formed
+
+template <typename Tout, typename Arg>
+KFR_INTRINSIC simd<Tout, 1> simd_make(ctype_t<Tout>, const Arg& arg) CMT_NOEXCEPT
+{ // base case: one argument -> one-lane vector (with conversion to Tout)
+    return simd<Tout, 1>{ static_cast<Tout>(arg) };
+}
+
+template <typename T, size_t... indices, typename... Args, size_t N = sizeof...(indices)>
+KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... args) CMT_NOEXCEPT; // forward decl (mutually recursive with simd_make)
+
+template <typename Tout, typename... Args, size_t N = sizeof...(Args), KFR_ENABLE_IF(N > 1)>
+KFR_INTRINSIC simd<Tout, N> simd_make(ctype_t<Tout>, const Args&... args) CMT_NOEXCEPT
+{ // recursive case: split args at the largest power of two < N so halves map onto native concat overloads
+    constexpr size_t Nlow = prev_poweroftwo(N - 1);
+    return simd_concat<Tout, Nlow, N - Nlow>(simd_make_helper<Tout>(csizeseq<Nlow>, args...),
+                                             simd_make_helper<Tout>(csizeseq<N - Nlow, Nlow>, args...));
+}
+
+template <typename T, size_t... indices, typename... Args, size_t N>
+KFR_INTRINSIC simd<T, N> simd_make_helper(csizes_t<indices...>, const Args&... args) CMT_NOEXCEPT
+{ // select the subset of args given by 'indices' and build a vector from them
+    not_optimized(CMT_FUNC_SIGNATURE);
+    const T temp[] = { static_cast<T>(args)... };
+    return simd_make(ctype<T>, temp[indices]...);
+}
+
+/// @brief Returns vector with undefined value
+template <typename Tout, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_undefined() CMT_NOEXCEPT
+{
+    not_optimized(CMT_FUNC_SIGNATURE); // fallback path marker; arch-specific overloads take priority
+    simd<Tout, N> x; // intentionally uninitialized
+    return x;
+}
+
+/// @brief Returns vector with all zeros
+template <typename Tout, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_zeros() CMT_NOEXCEPT
+{
+    not_optimized(CMT_FUNC_SIGNATURE);
+    return from_simd_array<Tout, N>({ Tout() }); // aggregate init: element 0 value-initialized, remaining elements zero-initialized
+}
+
+/// @brief Returns vector with all ones
+template <typename Tout, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_allones() CMT_NOEXCEPT
+{
+    not_optimized(CMT_FUNC_SIGNATURE);
+    simd_array<Tout, N> x{};
+    KFR_COMPONENTWISE(x.val[i] = special_constants<Tout>::allones()); // all bits set, per element type
+    return from_simd_array(x);
+}
+
+/// @brief Converts input vector to vector with subtype Tout
+template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout)),
+          KFR_ENABLE_IF(Nout == 1 || N == 1)>
+KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT
+{ // base case (either side is a single lane): plain bitwise reinterpretation, total size preserved
+    not_optimized(CMT_FUNC_SIGNATURE);
+    return bitcast_anything<simd<Tout, Nout>>(x);
+}
+
+/// @brief Converts input vector to vector with subtype Tout
+template <typename Tout, typename Tin, size_t N, size_t Nout = (sizeof(Tin) * N / sizeof(Tout)),
+          KFR_ENABLE_IF(Nout > 1 && N > 1)>
+KFR_INTRINSIC simd<Tout, Nout> simd_bitcast(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT
+{ // recursive case: split at largest power of two < N, bitcast halves, re-concat
+    constexpr size_t Nlow = prev_poweroftwo(N - 1);
+    return simd_concat<Tout, Nlow * Nout / N, (N - Nlow) * Nout / N>(
+        simd_bitcast(simd_cvt_t<Tout, Tin, Nlow>{},
+                     simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)),
+        simd_bitcast(simd_cvt_t<Tout, Tin, N - Nlow>{},
+                     simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto)));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT
+{ // same-type bitcast is the identity; return by reference, no copy
+    return x;
+}
+
+template <typename T, size_t N, size_t index>
+KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT
+{ // compile-time-indexed element read; generic path goes through a full array round-trip
+    not_optimized(CMT_FUNC_SIGNATURE);
+    return to_simd_array<T, N>(value).val[index];
+}
+
+template <typename T, size_t N, size_t index>
+KFR_INTRINSIC simd<T, N> simd_set_element(simd<T, N> value, csize_t<index>, T x) CMT_NOEXCEPT
+{ // compile-time-indexed element write; returns modified copy, input untouched
+    not_optimized(CMT_FUNC_SIGNATURE);
+    simd_array<T, N> arr = to_simd_array<T, N>(value);
+    arr.val[index]       = x;
+    return from_simd_array(arr);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC const simd<T, N>& simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N>,
+                                             overload_priority<10>) CMT_NOEXCEPT
+{ // indices 0..N-1 in order: identity shuffle, highest priority, zero cost
+    return x;
+}
+
+template <typename T, size_t N1, size_t N2>
+KFR_INTRINSIC const simd<T, N1>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>&,
+                                              csizeseq_t<N1>, overload_priority<9>) CMT_NOEXCEPT
+{ // two-input shuffle that selects exactly the first input unchanged
+    return x;
+}
+
+template <typename T, size_t N1, size_t N2>
+KFR_INTRINSIC const simd<T, N2>& simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>&, const simd<T, N2>& y,
+                                              csizeseq_t<N2, N1>, overload_priority<9>) CMT_NOEXCEPT
+{ // indices N1..N1+N2-1: selects exactly the second input unchanged
+    return y;
+}
+
+// concat()
+template <typename T, size_t N,
+          KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N + N>, simd_halves<T, N + N>>::value)>
+KFR_INTRINSIC simd<T, N + N> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y,
+                                          csizeseq_t<N + N>, overload_priority<8>) CMT_NOEXCEPT
+{ // when the 2N-lane type is stored as a low/high pair, concat is just aggregate construction
+    return simd<T, N + N>{ x, y };
+}
+
+template <typename T>
+KFR_INTRINSIC simd<T, 1> simd_broadcast(simd_t<T, 1>, identity<T> value) CMT_NOEXCEPT
+{ // base case: one lane
+    return { value };
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N >= 2), size_t Nlow = prev_poweroftwo(N - 1)>
+KFR_INTRINSIC simd<T, N> simd_broadcast(simd_t<T, N>, identity<T> value) CMT_NOEXCEPT
+{ // recursive case: broadcast into power-of-two halves, then concat
+    return simd_concat<T, Nlow, N - Nlow>(simd_broadcast(simd_t<T, Nlow>{}, value),
+                                          simd_broadcast(simd_t<T, N - Nlow>{}, value));
+}
+
+template <typename T, size_t N,
+          KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N>, simd_halves<T, N>>::value)>
+KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2>,
+                                          overload_priority<7>) CMT_NOEXCEPT
+{ // indices 0..N/2-1 on a low/high-pair representation: just return the low half
+    return x.low;
+}
+
+template <typename T, size_t N,
+          KFR_ENABLE_IF(is_poweroftwo(N) && is_same<simd<T, N>, simd_halves<T, N>>::value)>
+KFR_INTRINSIC simd<T, N / 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizeseq_t<N / 2, N / 2>,
+                                          overload_priority<7>) CMT_NOEXCEPT
+{ // indices N/2..N-1: return the high half
+    return x.high;
+}
+
+template <typename T, size_t Nout, size_t N>
+simd_array<T, Nout> simd_shuffle_generic(const simd_array<T, N>& x, const unsigned (&indices)[Nout])
+{ // scalar one-input shuffle: out-of-range index (>= N, including index_undefined) yields T()
+    simd_array<T, Nout> result;
+    for (size_t i = 0; i < Nout; ++i)
+    {
+        const size_t index = indices[i];
+        result.val[i]      = index >= N ? T() : x.val[index];
+    }
+    return result;
+}
+
+template <typename T, size_t Nout, size_t N1, size_t N2>
+simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd_array<T, N2>& y,
+                                          const unsigned (&indices)[Nout])
+{ // scalar two-input shuffle: index < N1 reads x, N1 <= index < N1+N2 reads y, otherwise T()
+    simd_array<T, Nout> result;
+    for (size_t i = 0; i < Nout; ++i)
+    {
+        const size_t index = indices[i];
+        result.val[i] = index >= N1 + N2 ? T() : index >= N1 ? y.val[index - N1] : x.val[index]; // >= (was >): index == N1+N2 would read y.val[N2] out of bounds; matches simd_shuffle_generic
+    }
+    return result;
+}
+
+template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>,
+                                         overload_generic) CMT_NOEXCEPT
+{ // lowest-priority one-input shuffle; out-of-range index (>= N, incl. index_undefined) yields T()
+    not_optimized(CMT_FUNC_SIGNATURE);
+#ifdef CMT_COMPILER_MSVC
+    const simd_array<T, N> xx = to_simd_array<T, N>(x);
+    constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... };
+    return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, N>(xx, indices_array));
+#else
+    return from_simd_array<T, Nout>({ (indices >= N ? T() : to_simd_array<T, N>(x).val[indices])... }); // >= (was >): index == N is out of range; keeps both compiler paths consistent (MSVC path already uses >=)
+#endif
+}
+
+template <typename T, size_t N, size_t N2 = N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, const simd<T, N>& y,
+                                         csizes_t<indices...>, overload_generic) CMT_NOEXCEPT
+{ // generic two-input shuffle (equal widths); index < N reads x, N <= index < 2N reads y, otherwise T()
+    static_assert(N == N2, "");
+    not_optimized(CMT_FUNC_SIGNATURE);
+#ifdef CMT_COMPILER_MSVC
+    const simd_array<T, N> xx = to_simd_array<T, N>(x);
+    const simd_array<T, N> yy = to_simd_array<T, N>(y);
+    constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... };
+    return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N, N>(xx, yy, indices_array));
+#else
+    return from_simd_array<T, Nout>(
+        { (indices >= N * 2 ? T() // >= (was >): index == 2N would read y.val[N] out of bounds
+                            : indices >= N ? to_simd_array<T, N>(y).val[indices - N]
+                                           : to_simd_array<T, N>(x).val[indices])... });
+#endif
+}
+
+template <typename T, size_t N1, size_t N2, size_t... indices, KFR_ENABLE_IF(N1 != N2),
+          size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& x, const simd<T, N2>& y,
+                                         csizes_t<indices...>, overload_generic) CMT_NOEXCEPT
+{ // generic two-input shuffle (different widths); index < N1 reads x, N1 <= index < N1+N2 reads y, otherwise T()
+    not_optimized(CMT_FUNC_SIGNATURE);
+
+#ifdef CMT_COMPILER_MSVC
+    const simd_array<T, N1> xx = to_simd_array<T, N1>(x);
+    const simd_array<T, N2> yy = to_simd_array<T, N2>(y);
+    constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... };
+    return from_simd_array<T, Nout>(simd_shuffle2_generic<T, Nout, N1, N2>(xx, yy, indices_array));
+#else
+
+    return from_simd_array<T, Nout>(
+        { (indices >= N1 + N2 ? T() // >= (was >): index == N1+N2 would read y.val[N2] out of bounds
+                              : indices >= N1 ? to_simd_array<T, N2>(y).val[indices - N1]
+                                              : to_simd_array<T, N1>(x).val[indices])... });
+#endif
+}
+
+template <typename T, size_t N1>
+KFR_INTRINSIC const simd<T, N1>& simd_concat(const simd<T, N1>& x) CMT_NOEXCEPT
+{ // base case: concat of one vector is the vector itself
+    return x;
+}
+
+template <typename T, size_t N1, size_t N2, size_t... Ns, size_t Nscount /*= csum(csizes<Ns...>)*/>
+KFR_INTRINSIC simd<T, N1 + N2 + Nscount> simd_concat(const simd<T, N1>& x, const simd<T, N2>& y,
+                                                     const simd<T, Ns>&... z) CMT_NOEXCEPT
+{ // fold right: concat tail (y, z...) first, then splice x on the front via a two-input shuffle
+    return simd_shuffle(simd2_t<T, N1, N2 + Nscount>{}, x, simd_concat<T, N2, Ns...>(y, z...),
+                        csizeseq<N1 + N2 + Nscount>, overload_auto);
+}
+
+template <typename Tout, typename Tin, size_t N, size_t... indices>
+KFR_INTRINSIC simd<Tout, N> simd_convert__(const simd<Tin, N>& x, csizes_t<indices...>) CMT_NOEXCEPT
+{ // element-wise static_cast of every lane; 'indices' must be 0..N-1
+    const simd_array<Tin, N> xx = to_simd_array<Tin, N>(x);
+    return simd_make(ctype<Tout>, static_cast<Tout>(xx.val[indices])...);
+}
+
+/// @brief Converts input vector to vector with subtype Tout
+template <typename Tout, typename Tin>
+KFR_INTRINSIC simd<Tout, 1> simd_convert(simd_cvt_t<Tout, Tin, 1>, const simd<Tin, 1>& x) CMT_NOEXCEPT
+{ // base case: single lane; static_cast on x directly (presumably simd<T, 1> is scalar-convertible — verify against simd type definition)
+    not_optimized(CMT_FUNC_SIGNATURE);
+    return simd_make(ctype<Tout>, static_cast<Tout>(x));
+}
+
+/// @brief Converts input vector to vector with subtype Tout
+template <typename Tout, typename Tin, size_t N>
+KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT
+{ // recursive case: split at largest power of two < N, convert halves, re-concat
+    constexpr size_t Nlow = prev_poweroftwo(N - 1);
+    return simd_concat<Tout, Nlow, N - Nlow>(
+        simd_convert(simd_cvt_t<Tout, Tin, Nlow>{},
+                     simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<Nlow>, overload_auto)),
+        simd_convert(simd_cvt_t<Tout, Tin, N - Nlow>{},
+                     simd_shuffle(simd_t<Tin, N>{}, x, csizeseq<N - Nlow, Nlow>, overload_auto)));
+}
+
+/// @brief Converts input vector to vector with subtype Tout
+template <typename T, size_t N>
+KFR_INTRINSIC const simd<T, N>& simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) CMT_NOEXCEPT
+{ // same-type conversion is the identity; return by reference, no copy
+    return x;
+}
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-attributes")
+
+template <typename T, size_t N, bool A>
+using simd_storage = struct_with_alignment<simd<T, N>, A>; // A selects aligned vs unaligned access wrapper used by simd_read/simd_write below
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
+KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
+{ // power-of-two load: single (aligned if A) vector load through the storage wrapper
+    return reinterpret_cast<typename simd_storage<T, N, A>::const_pointer>(src)->value;
+}
+
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
+KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
+{ // non-power-of-two load: largest power-of-two load + recursive remainder load, glued by shuffle
+    constexpr size_t first = prev_poweroftwo(N);
+    constexpr size_t rest  = N - first;
+    constexpr auto extend_indices =
+        cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); // pad remainder to 'first' lanes; padding lanes undefined
+    constexpr auto concat_indices = cvalseq_t<size_t, N>();
+    return simd_shuffle( // take lanes 0..first-1 from first load, first..N-1 from padded remainder
+        simd2_t<T, first, first>{}, simd_read<first, A>(src),
+        simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto),
+        concat_indices, overload_auto);
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
+KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT
+{ // power-of-two store: single (aligned if A) vector store through the storage wrapper
+    reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = value;
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
+KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT
+{ // non-power-of-two store: split into largest power-of-two store + recursive remainder store (remainder is unaligned regardless of A)
+    constexpr size_t first = prev_poweroftwo(N);
+    constexpr size_t rest  = N - first;
+    simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<first>(), overload_auto));
+    simd_write<false, rest>(dest + first,
+                            simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<rest, first>(), overload_auto));
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) CMT_NOEXCEPT
+{ // runtime-indexed element read; no bounds check — caller must pass index < N
+    not_optimized(CMT_FUNC_SIGNATURE);
+    return to_simd_array<T, N>(value).val[index];
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, T x) CMT_NOEXCEPT
+{ // runtime-indexed element write; returns modified copy, input untouched
+    not_optimized(CMT_FUNC_SIGNATURE);
+    simd_array<T, N> arr = to_simd_array<T, N>(value);
+    arr.val[index]       = x;
+    return from_simd_array(arr);
+}
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/simd/impl/basicoperators_clang.hpp b/include/kfr/simd/impl/basicoperators_clang.hpp
@@ -0,0 +1,178 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../mask.hpp"
+#include "function.hpp"
+#include <algorithm>
+#include <utility>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x)
+{ // arithmetic negation via clang/gcc native vector operator
+    return -x.v;
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x)
+{ // bitwise NOT: route through utype<T> because operator~ is not defined for float vectors
+    return simd_bitcast(simd_cvt_t<T, utype<T>, N>{}, ~simd_bitcast(simd_cvt_t<utype<T>, T, N>{}, x.v));
+}
+
+#define KFR_OP_SCALAR2(fn, op, resultprefix, operprefix, soperprefix)                                      \
+    template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>                                 \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& x, const T& y)                                             \
+    {                                                                                                      \
+        return resultprefix(operprefix(x.v) op soperprefix(y));                                            \
+    }                                                                                                      \
+    template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>                                 \
+    KFR_INTRINSIC vec<T, N> fn(const T& x, const vec<T, N>& y)                                             \
+    {                                                                                                      \
+        return resultprefix(soperprefix(x) op operprefix(y.v));                                            \
+    } // KFR_OP_SCALAR2: generates the vec-op-scalar and scalar-op-vec overloads of 'fn'; the three prefixes insert optional casts around result/vector/scalar operands
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> add(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise addition via native vector operator
+    return x.v + y.v;
+}
+KFR_OP_SCALAR2(add, +, , , ) // plus vec+scalar / scalar+vec overloads
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> sub(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise subtraction
+    return x.v - y.v;
+}
+KFR_OP_SCALAR2(sub, -, , , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> mul(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise multiplication
+    return x.v * y.v;
+}
+KFR_OP_SCALAR2(mul, *, , , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> div(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise division
+    return x.v / y.v;
+}
+KFR_OP_SCALAR2(div, /, , , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> band(const vec<T, N>& x, const vec<T, N>& y)
+{ // bitwise AND; operands reinterpreted as unsigned-integer vectors so float types work
+    return (simd<T, N>)((simd<utype<T>, N>)(x.v) & (simd<utype<T>, N>)(y.v));
+}
+KFR_OP_SCALAR2(band, &, (simd<T, N>), (simd<utype<T>, N>), ubitcast) // scalar side converted with ubitcast
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bor(const vec<T, N>& x, const vec<T, N>& y)
+{ // bitwise OR via unsigned reinterpretation
+    return (simd<T, N>)((simd<utype<T>, N>)(x.v) | (simd<utype<T>, N>)(y.v));
+}
+KFR_OP_SCALAR2(bor, |, (simd<T, N>), (simd<utype<T>, N>), ubitcast)
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bxor(const vec<T, N>& x, const vec<T, N>& y)
+{ // bitwise XOR via unsigned reinterpretation
+    return (simd<T, N>)((simd<utype<T>, N>)(x.v) ^ (simd<utype<T>, N>)(y.v));
+}
+KFR_OP_SCALAR2(bxor, ^, (simd<T, N>), (simd<utype<T>, N>), ubitcast)
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{ // per-lane left shift; x reinterpreted element-wise via uitype<deep_subtype<T>> (handles compound element types such as complex)
+    return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) << y.v);
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{ // per-lane right shift (arithmetic vs logical follows the signedness of uitype — NOTE(review): uitype definition not visible here)
+    return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) >> y.v);
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, unsigned y)
+{ // uniform left shift: same scalar count applied to every lane
+    return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) << y);
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y)
+{ // uniform right shift
+    return (simd<T, N>)((simd<uitype<deep_subtype<T>>, N * sizeof(deep_subtype<T>) / sizeof(T)>)(x.v) >> y);
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise ==; native comparison yields an integer mask vector, cast back to simd<T, N>
+    return (simd<T, N>)(x.v == y.v);
+}
+KFR_OP_SCALAR2(eq, ==, (simd<T, N>), , ) // plus vec/scalar overloads
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise !=
+    return (simd<T, N>)(x.v != y.v);
+}
+KFR_OP_SCALAR2(ne, !=, (simd<T, N>), , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise <=
+    return (simd<T, N>)(x.v <= y.v);
+}
+KFR_OP_SCALAR2(le, <=, (simd<T, N>), , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise >=
+    return (simd<T, N>)(x.v >= y.v);
+}
+KFR_OP_SCALAR2(ge, >=, (simd<T, N>), , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise <
+    return (simd<T, N>)(x.v < y.v);
+}
+KFR_OP_SCALAR2(lt, <, (simd<T, N>), , )
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y)
+{ // lane-wise >
+    return (simd<T, N>)(x.v > y.v);
+}
+KFR_OP_SCALAR2(gt, >, (simd<T, N>), , )
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/impl/basicoperators_generic.hpp b/include/kfr/simd/impl/basicoperators_generic.hpp
@@ -0,0 +1,1674 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../mask.hpp"
+#include "function.hpp"
+#include <algorithm>
+#include <utility>
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4700))
+CMT_PRAGMA_MSVC(warning(disable : 4309))
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+#if defined CMT_ARCH_SSE2 && defined KFR_NATIVE_INTRINSICS
+
+KFR_INTRINSIC __m128 _mm_allones_ps()
+{
+ return _mm_castsi128_ps(_mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()));
+}
+
+KFR_INTRINSIC __m128d _mm_allones_pd()
+{
+ return _mm_castsi128_pd(_mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()));
+}
+
+KFR_INTRINSIC __m128i _mm_allones_si128() { return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); }
+
+KFR_INTRINSIC __m128 _mm_not_ps(const __m128& x) { return _mm_xor_ps(x, _mm_allones_ps()); }
+
+KFR_INTRINSIC __m128d _mm_not_pd(const __m128d& x) { return _mm_xor_pd(x, _mm_allones_pd()); }
+
+KFR_INTRINSIC __m128i _mm_not_si128(const __m128i& x) { return _mm_xor_si128(x, _mm_allones_si128()); }
+
+KFR_INTRINSIC __m128i _mm_highbit_epi8() { return _mm_set1_epi8(static_cast<char>(0x80)); }
+KFR_INTRINSIC __m128i _mm_highbit_epi16() { return _mm_set1_epi16(static_cast<short>(0x8000)); }
+KFR_INTRINSIC __m128i _mm_highbit_epi32() { return _mm_set1_epi32(static_cast<int>(0x80000000)); }
+KFR_INTRINSIC __m128i _mm_highbit_epi64() { return _mm_set1_epi64x(0x8000000000000000ll); }
+
+KFR_INTRINSIC f32sse add(const f32sse& x, const f32sse& y) { return f32sse(_mm_add_ps(x.v, y.v)); }
+KFR_INTRINSIC f32sse sub(const f32sse& x, const f32sse& y) { return f32sse(_mm_sub_ps(x.v, y.v)); }
+KFR_INTRINSIC f32sse mul(const f32sse& x, const f32sse& y) { return f32sse(_mm_mul_ps(x.v, y.v)); }
+KFR_INTRINSIC f32sse div(const f32sse& x, const f32sse& y) { return f32sse(_mm_div_ps(x.v, y.v)); }
+
+KFR_INTRINSIC f64sse add(const f64sse& x, const f64sse& y) { return f64sse(_mm_add_pd(x.v, y.v)); }
+KFR_INTRINSIC f64sse sub(const f64sse& x, const f64sse& y) { return f64sse(_mm_sub_pd(x.v, y.v)); }
+KFR_INTRINSIC f64sse mul(const f64sse& x, const f64sse& y) { return f64sse(_mm_mul_pd(x.v, y.v)); }
+KFR_INTRINSIC f64sse div(const f64sse& x, const f64sse& y) { return f64sse(_mm_div_pd(x.v, y.v)); }
+
+KFR_INTRINSIC u8sse add(const u8sse& x, const u8sse& y) { return _mm_add_epi8(x.v, y.v); }
+KFR_INTRINSIC u8sse sub(const u8sse& x, const u8sse& y) { return _mm_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC u8sse div(const u8sse& x, const u8sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u8sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC i8sse add(const i8sse& x, const i8sse& y) { return _mm_add_epi8(x.v, y.v); }
+KFR_INTRINSIC i8sse sub(const i8sse& x, const i8sse& y) { return _mm_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC i8sse div(const i8sse& x, const i8sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i8sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC __m128i mul_epi8(const __m128i& x, const __m128i& y)
+{
+    const __m128i even = _mm_mullo_epi16(x, y); // 16-bit products; low byte of each is the even-lane result
+    const __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(x, 8), _mm_srli_epi16(y, 8)); // odd byte lanes shifted down, multiplied
+    return _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_srli_epi16(_mm_slli_epi16(even, 8), 8)); // merge: odd results back to high bytes, even results masked to low bytes
+}
+
+KFR_INTRINSIC u8sse mul(const u8sse& x, const u8sse& y) { return mul_epi8(x.v, y.v); }
+
+KFR_INTRINSIC i8sse mul(const i8sse& x, const i8sse& y) { return mul_epi8(x.v, y.v); }
+
+KFR_INTRINSIC u16sse add(const u16sse& x, const u16sse& y) { return _mm_add_epi16(x.v, y.v); }
+KFR_INTRINSIC u16sse sub(const u16sse& x, const u16sse& y) { return _mm_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC u16sse mul(const u16sse& x, const u16sse& y) { return _mm_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC u16sse div(const u16sse& x, const u16sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u16sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC i16sse add(const i16sse& x, const i16sse& y) { return _mm_add_epi16(x.v, y.v); }
+KFR_INTRINSIC i16sse sub(const i16sse& x, const i16sse& y) { return _mm_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC i16sse mul(const i16sse& x, const i16sse& y) { return _mm_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC i16sse div(const i16sse& x, const i16sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i16sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC u32sse add(const u32sse& x, const u32sse& y) { return _mm_add_epi32(x.v, y.v); }
+KFR_INTRINSIC u32sse sub(const u32sse& x, const u32sse& y) { return _mm_sub_epi32(x.v, y.v); }
+
+KFR_INTRINSIC i32sse add(const i32sse& x, const i32sse& y) { return _mm_add_epi32(x.v, y.v); }
+KFR_INTRINSIC i32sse sub(const i32sse& x, const i32sse& y) { return _mm_sub_epi32(x.v, y.v); }
+
+#if defined CMT_ARCH_SSE41
+KFR_INTRINSIC u32sse mul(const u32sse& x, const u32sse& y) { return _mm_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC i32sse mul(const i32sse& x, const i32sse& y) { return _mm_mullo_epi32(x.v, y.v); }
+#else
+KFR_INTRINSIC u32sse mul(const u32sse& x, const u32sse& y)
+{
+ __m128i tmp1 = _mm_mul_epu32(x.v, y.v);
+ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x.v, 4), _mm_srli_si128(y.v, 4));
+ return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
+ _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
+}
+KFR_INTRINSIC i32sse mul(const i32sse& x, const i32sse& y)
+{
+ __m128i tmp1 = _mm_mul_epu32(x.v, y.v);
+ __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(x.v, 4), _mm_srli_si128(y.v, 4));
+ return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
+ _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
+}
+#endif
+KFR_INTRINSIC u32sse div(const u32sse& x, const u32sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u32sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i32sse div(const i32sse& x, const i32sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i32sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC u64sse add(const u64sse& x, const u64sse& y) { return _mm_add_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse sub(const u64sse& x, const u64sse& y) { return _mm_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse mul(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = x[i] * y[i]);
+}
+KFR_INTRINSIC u64sse div(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC i64sse add(const i64sse& x, const i64sse& y) { return _mm_add_epi64(x.v, y.v); }
+KFR_INTRINSIC i64sse sub(const i64sse& x, const i64sse& y) { return _mm_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC i64sse mul(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] * y[i]);
+}
+KFR_INTRINSIC i64sse div(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC f32sse shl(const f32sse& x, unsigned y)
+{
+ return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(x.v), y));
+}
+KFR_INTRINSIC f64sse shl(const f64sse& x, unsigned y)
+{
+ return _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(x.v), y));
+}
+KFR_INTRINSIC f32sse shr(const f32sse& x, unsigned y)
+{
+ return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(x.v), y));
+}
+KFR_INTRINSIC f64sse shr(const f64sse& x, unsigned y)
+{
+ return _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(x.v), y));
+}
+
+KFR_INTRINSIC u16sse shl(const u16sse& x, unsigned y) { return _mm_slli_epi16(x.v, y); }
+KFR_INTRINSIC u32sse shl(const u32sse& x, unsigned y) { return _mm_slli_epi32(x.v, y); }
+KFR_INTRINSIC u64sse shl(const u64sse& x, unsigned y) { return _mm_slli_epi64(x.v, y); }
+KFR_INTRINSIC i16sse shl(const i16sse& x, unsigned y) { return _mm_slli_epi16(x.v, y); }
+KFR_INTRINSIC i32sse shl(const i32sse& x, unsigned y) { return _mm_slli_epi32(x.v, y); }
+KFR_INTRINSIC i64sse shl(const i64sse& x, unsigned y) { return _mm_slli_epi64(x.v, y); }
+
+KFR_INTRINSIC u16sse shr(const u16sse& x, unsigned y) { return _mm_srli_epi16(x.v, y); }
+KFR_INTRINSIC u32sse shr(const u32sse& x, unsigned y) { return _mm_srli_epi32(x.v, y); }
+KFR_INTRINSIC u64sse shr(const u64sse& x, unsigned y) { return _mm_srli_epi64(x.v, y); }
+KFR_INTRINSIC i16sse shr(const i16sse& x, unsigned y) { return _mm_srai_epi16(x.v, y); }
+KFR_INTRINSIC i32sse shr(const i32sse& x, unsigned y) { return _mm_srai_epi32(x.v, y); }
+
+KFR_INTRINSIC u8sse shl(const u8sse& x, unsigned y)
+{
+ __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
+ __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);
+
+ __m128i ll = _mm_slli_epi16(l, y);
+ __m128i hh = _mm_slli_epi16(h, y);
+
+ return _mm_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8sse shl(const i8sse& x, unsigned y)
+{
+ __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
+ __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);
+
+ __m128i ll = _mm_slli_epi16(l, y);
+ __m128i hh = _mm_slli_epi16(h, y);
+
+ return _mm_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC u8sse shr(const u8sse& x, unsigned y)
+{
+ __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
+ __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);
+
+ __m128i ll = _mm_srli_epi16(l, y);
+ __m128i hh = _mm_srli_epi16(h, y);
+
+ return _mm_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8sse shr(const i8sse& x, unsigned y)
+{
+ __m128i l = _mm_unpacklo_epi8(_mm_setzero_si128(), x.v);
+ __m128i h = _mm_unpackhi_epi8(_mm_setzero_si128(), x.v);
+
+ __m128i ll = _mm_srai_epi16(l, y);
+ __m128i hh = _mm_srai_epi16(h, y);
+
+ return _mm_packs_epi16(ll, hh);
+}
+
+KFR_INTRINSIC i64sse shr(const i64sse& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] >> y); // arithmetic shift: SSE2 has no _mm_srai_epi64; result type fixed to match the signed return type
+}
+
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+ KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+ KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y[i])));
+}
+
+KFR_INTRINSIC f32sse band(const f32sse& x, const f32sse& y) { return _mm_and_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse band(const f64sse& x, const f64sse& y) { return _mm_and_pd(x.v, y.v); }
+
+KFR_INTRINSIC u8sse band(const u8sse& x, const u8sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC u16sse band(const u16sse& x, const u16sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC u32sse band(const u32sse& x, const u32sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC u64sse band(const u64sse& x, const u64sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC i8sse band(const i8sse& x, const i8sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC i16sse band(const i16sse& x, const i16sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC i32sse band(const i32sse& x, const i32sse& y) { return _mm_and_si128(x.v, y.v); }
+KFR_INTRINSIC i64sse band(const i64sse& x, const i64sse& y) { return _mm_and_si128(x.v, y.v); }
+
+KFR_INTRINSIC f32sse bor(const f32sse& x, const f32sse& y) { return _mm_or_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse bor(const f64sse& x, const f64sse& y) { return _mm_or_pd(x.v, y.v); }
+
+KFR_INTRINSIC u8sse bor(const u8sse& x, const u8sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC u16sse bor(const u16sse& x, const u16sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC u32sse bor(const u32sse& x, const u32sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC u64sse bor(const u64sse& x, const u64sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC i8sse bor(const i8sse& x, const i8sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC i16sse bor(const i16sse& x, const i16sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC i32sse bor(const i32sse& x, const i32sse& y) { return _mm_or_si128(x.v, y.v); }
+KFR_INTRINSIC i64sse bor(const i64sse& x, const i64sse& y) { return _mm_or_si128(x.v, y.v); }
+
+KFR_INTRINSIC f32sse bxor(const f32sse& x, const f32sse& y) { return _mm_xor_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse bxor(const f64sse& x, const f64sse& y) { return _mm_xor_pd(x.v, y.v); }
+
+KFR_INTRINSIC u8sse bxor(const u8sse& x, const u8sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC u16sse bxor(const u16sse& x, const u16sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC u32sse bxor(const u32sse& x, const u32sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC u64sse bxor(const u64sse& x, const u64sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC i8sse bxor(const i8sse& x, const i8sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC i16sse bxor(const i16sse& x, const i16sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC i32sse bxor(const i32sse& x, const i32sse& y) { return _mm_xor_si128(x.v, y.v); }
+KFR_INTRINSIC i64sse bxor(const i64sse& x, const i64sse& y) { return _mm_xor_si128(x.v, y.v); }
+
+KFR_INTRINSIC f32sse eq(const f32sse& x, const f32sse& y) { return _mm_cmpeq_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse eq(const f64sse& x, const f64sse& y) { return _mm_cmpeq_pd(x.v, y.v); }
+KFR_INTRINSIC u8sse eq(const u8sse& x, const u8sse& y) { return _mm_cmpeq_epi8(x.v, y.v); }
+KFR_INTRINSIC u16sse eq(const u16sse& x, const u16sse& y) { return _mm_cmpeq_epi16(x.v, y.v); }
+KFR_INTRINSIC u32sse eq(const u32sse& x, const u32sse& y) { return _mm_cmpeq_epi32(x.v, y.v); }
+KFR_INTRINSIC i8sse eq(const i8sse& x, const i8sse& y) { return _mm_cmpeq_epi8(x.v, y.v); }
+KFR_INTRINSIC i16sse eq(const i16sse& x, const i16sse& y) { return _mm_cmpeq_epi16(x.v, y.v); }
+KFR_INTRINSIC i32sse eq(const i32sse& x, const i32sse& y) { return _mm_cmpeq_epi32(x.v, y.v); }
+
+KFR_INTRINSIC f32sse ne(const f32sse& x, const f32sse& y) { return _mm_not_ps(_mm_cmpeq_ps(x.v, y.v)); }
+KFR_INTRINSIC f64sse ne(const f64sse& x, const f64sse& y) { return _mm_not_pd(_mm_cmpeq_pd(x.v, y.v)); }
+KFR_INTRINSIC u8sse ne(const u8sse& x, const u8sse& y) { return _mm_not_si128(_mm_cmpeq_epi8(x.v, y.v)); }
+KFR_INTRINSIC u16sse ne(const u16sse& x, const u16sse& y) { return _mm_not_si128(_mm_cmpeq_epi16(x.v, y.v)); }
+KFR_INTRINSIC u32sse ne(const u32sse& x, const u32sse& y) { return _mm_not_si128(_mm_cmpeq_epi32(x.v, y.v)); }
+KFR_INTRINSIC i8sse ne(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmpeq_epi8(x.v, y.v)); }
+KFR_INTRINSIC i16sse ne(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmpeq_epi16(x.v, y.v)); }
+KFR_INTRINSIC i32sse ne(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmpeq_epi32(x.v, y.v)); }
+
+KFR_INTRINSIC f32sse lt(const f32sse& x, const f32sse& y) { return _mm_cmplt_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse lt(const f64sse& x, const f64sse& y) { return _mm_cmplt_pd(x.v, y.v); }
+KFR_INTRINSIC i8sse lt(const i8sse& x, const i8sse& y) { return _mm_cmplt_epi8(x.v, y.v); }
+KFR_INTRINSIC i16sse lt(const i16sse& x, const i16sse& y) { return _mm_cmplt_epi16(x.v, y.v); }
+KFR_INTRINSIC i32sse lt(const i32sse& x, const i32sse& y) { return _mm_cmplt_epi32(x.v, y.v); }
+
+KFR_INTRINSIC u8sse lt(const u8sse& x, const u8sse& y)
+{
+ const __m128i hb = _mm_highbit_epi8();
+ return _mm_cmplt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb));
+}
+
+KFR_INTRINSIC u16sse lt(const u16sse& x, const u16sse& y)
+{
+ const __m128i hb = _mm_highbit_epi16();
+ return _mm_cmplt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb));
+}
+KFR_INTRINSIC u32sse lt(const u32sse& x, const u32sse& y)
+{
+ const __m128i hb = _mm_highbit_epi32();
+ return _mm_cmplt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb));
+}
+
+KFR_INTRINSIC f32sse gt(const f32sse& x, const f32sse& y) { return _mm_cmpgt_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse gt(const f64sse& x, const f64sse& y) { return _mm_cmpgt_pd(x.v, y.v); }
+KFR_INTRINSIC i8sse gt(const i8sse& x, const i8sse& y) { return _mm_cmpgt_epi8(x.v, y.v); }
+KFR_INTRINSIC i16sse gt(const i16sse& x, const i16sse& y) { return _mm_cmpgt_epi16(x.v, y.v); }
+KFR_INTRINSIC i32sse gt(const i32sse& x, const i32sse& y) { return _mm_cmpgt_epi32(x.v, y.v); }
+
+KFR_INTRINSIC u8sse gt(const u8sse& x, const u8sse& y)
+{
+ const __m128i hb = _mm_highbit_epi8();
+ return _mm_cmpgt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb));
+}
+
+KFR_INTRINSIC u16sse gt(const u16sse& x, const u16sse& y)
+{
+ const __m128i hb = _mm_highbit_epi16();
+ return _mm_cmpgt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb));
+}
+KFR_INTRINSIC u32sse gt(const u32sse& x, const u32sse& y)
+{
+ const __m128i hb = _mm_highbit_epi32();
+ return _mm_cmpgt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb));
+}
+
+KFR_INTRINSIC f32sse le(const f32sse& x, const f32sse& y) { return _mm_cmple_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse le(const f64sse& x, const f64sse& y) { return _mm_cmple_pd(x.v, y.v); }
+KFR_INTRINSIC i8sse le(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmpgt_epi8(x.v, y.v)); }
+KFR_INTRINSIC i16sse le(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmpgt_epi16(x.v, y.v)); }
+KFR_INTRINSIC i32sse le(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmpgt_epi32(x.v, y.v)); }
+
+KFR_INTRINSIC u8sse le(const u8sse& x, const u8sse& y)
+{
+ const __m128i hb = _mm_highbit_epi8();
+ return _mm_not_si128(_mm_cmpgt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb)));
+}
+
+KFR_INTRINSIC u16sse le(const u16sse& x, const u16sse& y)
+{
+ const __m128i hb = _mm_highbit_epi16();
+ return _mm_not_si128(_mm_cmpgt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb)));
+}
+KFR_INTRINSIC u32sse le(const u32sse& x, const u32sse& y)
+{
+ const __m128i hb = _mm_highbit_epi32();
+ return _mm_not_si128(_mm_cmpgt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb)));
+}
+
+KFR_INTRINSIC f32sse ge(const f32sse& x, const f32sse& y) { return _mm_cmpge_ps(x.v, y.v); }
+KFR_INTRINSIC f64sse ge(const f64sse& x, const f64sse& y) { return _mm_cmpge_pd(x.v, y.v); }
+KFR_INTRINSIC i8sse ge(const i8sse& x, const i8sse& y) { return _mm_not_si128(_mm_cmplt_epi8(x.v, y.v)); }
+KFR_INTRINSIC i16sse ge(const i16sse& x, const i16sse& y) { return _mm_not_si128(_mm_cmplt_epi16(x.v, y.v)); }
+KFR_INTRINSIC i32sse ge(const i32sse& x, const i32sse& y) { return _mm_not_si128(_mm_cmplt_epi32(x.v, y.v)); }
+
+KFR_INTRINSIC u8sse ge(const u8sse& x, const u8sse& y)
+{
+ const __m128i hb = _mm_highbit_epi8();
+ return _mm_not_si128(_mm_cmplt_epi8(_mm_add_epi8(x.v, hb), _mm_add_epi8(y.v, hb)));
+}
+
+KFR_INTRINSIC u16sse ge(const u16sse& x, const u16sse& y)
+{
+ const __m128i hb = _mm_highbit_epi16();
+ return _mm_not_si128(_mm_cmplt_epi16(_mm_add_epi16(x.v, hb), _mm_add_epi16(y.v, hb)));
+}
+KFR_INTRINSIC u32sse ge(const u32sse& x, const u32sse& y)
+{
+ const __m128i hb = _mm_highbit_epi32();
+ return _mm_not_si128(_mm_cmplt_epi32(_mm_add_epi32(x.v, hb), _mm_add_epi32(y.v, hb)));
+}
+
+#if defined CMT_ARCH_SSE41 && defined KFR_NATIVE_INTRINSICS
+KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y) { return _mm_cmpeq_epi64(x.v, y.v); }
+KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y) { return _mm_cmpeq_epi64(x.v, y.v); }
+KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y) { return _mm_not_si128(_mm_cmpeq_epi64(x.v, y.v)); }
+KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpeq_epi64(x.v, y.v)); }
+#else
+KFR_INTRINSIC u64sse eq(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] == y[i]));
+}
+KFR_INTRINSIC i64sse eq(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] == y[i]));
+}
+KFR_INTRINSIC u64sse ne(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] != y[i]));
+}
+KFR_INTRINSIC i64sse ne(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] != y[i]));
+}
+#endif
+
+#if defined CMT_ARCH_SSE42
+KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y) { return _mm_cmpgt_epi64(x.v, y.v); }
+KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y) { return _mm_cmpgt_epi64(y.v, x.v); }
+KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpgt_epi64(y.v, x.v)); }
+KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y) { return _mm_not_si128(_mm_cmpgt_epi64(x.v, y.v)); }
+
+KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
+{
+ const __m128i hb = _mm_highbit_epi64();
+ return _mm_cmpgt_epi64(_mm_add_epi64(x.v, hb), _mm_add_epi64(y.v, hb));
+}
+KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
+{
+ const __m128i hb = _mm_highbit_epi64();
+ return _mm_cmpgt_epi64(_mm_add_epi64(y.v, hb), _mm_add_epi64(x.v, hb));
+}
+KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
+{
+ const __m128i hb = _mm_highbit_epi64();
+ return _mm_not_si128(_mm_cmpgt_epi64(_mm_add_epi64(y.v, hb), _mm_add_epi64(x.v, hb)));
+}
+KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
+{
+ const __m128i hb = _mm_highbit_epi64();
+ return _mm_not_si128(_mm_cmpgt_epi64(_mm_add_epi64(x.v, hb), _mm_add_epi64(y.v, hb)));
+}
+
+#else
+KFR_INTRINSIC u64sse gt(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] > y[i]));
+}
+KFR_INTRINSIC i64sse gt(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] > y[i]));
+}
+KFR_INTRINSIC u64sse lt(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] < y[i]));
+}
+KFR_INTRINSIC i64sse lt(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] < y[i]));
+}
+KFR_INTRINSIC u64sse ge(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] >= y[i]));
+}
+KFR_INTRINSIC i64sse ge(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] >= y[i]));
+}
+KFR_INTRINSIC u64sse le(const u64sse& x, const u64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(u64sse, result[i] = internal::maskbits<u64>(x[i] <= y[i]));
+}
+KFR_INTRINSIC i64sse le(const i64sse& x, const i64sse& y)
+{
+ KFR_COMPONENTWISE_RET_I(i64sse, result[i] = internal::maskbits<i64>(x[i] <= y[i]));
+}
+#endif
+
+#if defined CMT_ARCH_AVX
+
+KFR_INTRINSIC f32avx add(const f32avx& x, const f32avx& y) { return f32avx(_mm256_add_ps(x.v, y.v)); }
+KFR_INTRINSIC f64avx add(const f64avx& x, const f64avx& y) { return f64avx(_mm256_add_pd(x.v, y.v)); }
+KFR_INTRINSIC f32avx sub(const f32avx& x, const f32avx& y) { return f32avx(_mm256_sub_ps(x.v, y.v)); }
+KFR_INTRINSIC f64avx sub(const f64avx& x, const f64avx& y) { return f64avx(_mm256_sub_pd(x.v, y.v)); }
+KFR_INTRINSIC f32avx mul(const f32avx& x, const f32avx& y) { return f32avx(_mm256_mul_ps(x.v, y.v)); }
+KFR_INTRINSIC f64avx mul(const f64avx& x, const f64avx& y) { return f64avx(_mm256_mul_pd(x.v, y.v)); }
+KFR_INTRINSIC f32avx div(const f32avx& x, const f32avx& y) { return f32avx(_mm256_div_ps(x.v, y.v)); }
+KFR_INTRINSIC f64avx div(const f64avx& x, const f64avx& y) { return f64avx(_mm256_div_pd(x.v, y.v)); }
+
+KFR_INTRINSIC __m256 _mm256_allones_ps()
+{
+ return _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ);
+}
+
+KFR_INTRINSIC __m256d _mm256_allones_pd()
+{
+ return _mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_UQ);
+}
+
+#if defined CMT_ARCH_AVX2
+KFR_INTRINSIC __m256i _mm256_allones_si256()
+{
+ return _mm256_cmpeq_epi8(_mm256_setzero_si256(), _mm256_setzero_si256());
+}
+#else
+KFR_INTRINSIC __m256i _mm256_allones_si256()
+{
+ return _mm256_castps_si256(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ));
+}
+#endif
+
+KFR_INTRINSIC __m256 _mm256_not_ps(const __m256& x) { return _mm256_xor_ps(x, _mm256_allones_ps()); }
+KFR_INTRINSIC __m256d _mm256_not_pd(const __m256d& x) { return _mm256_xor_pd(x, _mm256_allones_pd()); }
+KFR_INTRINSIC __m256i _mm256_not_si256(const __m256i& x)
+{
+ return _mm256_xor_si256(x, _mm256_allones_si256());
+}
+
+KFR_INTRINSIC __m256i _mm256_highbit_epi8() { return _mm256_set1_epi8(static_cast<char>(0x80)); }
+KFR_INTRINSIC __m256i _mm256_highbit_epi16() { return _mm256_set1_epi16(static_cast<short>(0x8000)); }
+KFR_INTRINSIC __m256i _mm256_highbit_epi32() { return _mm256_set1_epi32(static_cast<int>(0x80000000)); }
+KFR_INTRINSIC __m256i _mm256_highbit_epi64() { return _mm256_set1_epi64x(0x8000000000000000ll); }
+
+KFR_INTRINSIC f32avx eq(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_EQ_OQ); }
+KFR_INTRINSIC f64avx eq(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_EQ_OQ); }
+KFR_INTRINSIC f32avx ne(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_NEQ_UQ); } // unordered predicate: NaN operands compare not-equal, matching C++ != and the SSE not(cmpeq) path
+KFR_INTRINSIC f64avx ne(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_NEQ_UQ); }
+KFR_INTRINSIC f32avx lt(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_LT_OQ); }
+KFR_INTRINSIC f64avx lt(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_LT_OQ); }
+KFR_INTRINSIC f32avx gt(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_GT_OQ); }
+KFR_INTRINSIC f64avx gt(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_GT_OQ); }
+KFR_INTRINSIC f32avx le(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_LE_OQ); }
+KFR_INTRINSIC f64avx le(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_LE_OQ); }
+KFR_INTRINSIC f32avx ge(const f32avx& x, const f32avx& y) { return _mm256_cmp_ps(x.v, y.v, _CMP_GE_OQ); }
+KFR_INTRINSIC f64avx ge(const f64avx& x, const f64avx& y) { return _mm256_cmp_pd(x.v, y.v, _CMP_GE_OQ); }
+
+KFR_INTRINSIC f32avx band(const f32avx& x, const f32avx& y) { return _mm256_and_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx band(const f64avx& x, const f64avx& y) { return _mm256_and_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx bor(const f32avx& x, const f32avx& y) { return _mm256_or_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx bor(const f64avx& x, const f64avx& y) { return _mm256_or_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx bxor(const f32avx& x, const f32avx& y) { return _mm256_xor_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx bxor(const f64avx& x, const f64avx& y) { return _mm256_xor_pd(x.v, y.v); }
+
+KFR_INTRINSIC f32avx shl(const f32avx& x, unsigned y)
+{
+#if defined CMT_ARCH_AVX2
+ return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(x.v), y));
+#else
+ return _mm256_setr_m128(
+ _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(x.v)), y)),
+ _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(x.v, 1)), y)));
+#endif
+}
+KFR_INTRINSIC f64avx shl(const f64avx& x, unsigned y)
+{
+#if defined CMT_ARCH_AVX2
+ return _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(x.v), y));
+#else
+ return _mm256_setr_m128d(
+ _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(_mm256_castpd256_pd128(x.v)), y)),
+ _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(_mm256_extractf128_pd(x.v, 1)), y)));
+#endif
+}
+KFR_INTRINSIC f32avx shr(const f32avx& x, unsigned y)
+{
+#if defined CMT_ARCH_AVX2
+ return _mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(x.v), y));
+#else
+ return _mm256_setr_m128(
+ _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(_mm256_castps256_ps128(x.v)), y)),
+ _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(_mm256_extractf128_ps(x.v, 1)), y)));
+#endif
+}
+KFR_INTRINSIC f64avx shr(const f64avx& x, unsigned y)
+{
+#if defined CMT_ARCH_AVX2
+ return _mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(x.v), y));
+#else
+ return _mm256_setr_m128d(
+ _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(_mm256_castpd256_pd128(x.v)), y)),
+ _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(_mm256_extractf128_pd(x.v, 1)), y)));
+#endif
+}
+
+#if defined CMT_ARCH_AVX2
+
+KFR_INTRINSIC u8avx add(const u8avx& x, const u8avx& y) { return _mm256_add_epi8(x.v, y.v); }
+KFR_INTRINSIC u8avx sub(const u8avx& x, const u8avx& y) { return _mm256_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC u8avx div(const u8avx& x, const u8avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(u8avx, result[i] = y[i] ? x[i] / y[i] : 0); // guard zero divisor (UB otherwise), consistent with the SSE div fallbacks
+}
+
+KFR_INTRINSIC i8avx add(const i8avx& x, const i8avx& y) { return _mm256_add_epi8(x.v, y.v); }
+KFR_INTRINSIC i8avx sub(const i8avx& x, const i8avx& y) { return _mm256_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC i8avx div(const i8avx& x, const i8avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(i8avx, result[i] = y[i] ? x[i] / y[i] : 0); // guard zero divisor (UB otherwise), consistent with the SSE div fallbacks
+}
+
+KFR_INTRINSIC u16avx add(const u16avx& x, const u16avx& y) { return _mm256_add_epi16(x.v, y.v); }
+KFR_INTRINSIC u16avx sub(const u16avx& x, const u16avx& y) { return _mm256_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC u16avx mul(const u16avx& x, const u16avx& y) { return _mm256_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC u16avx div(const u16avx& x, const u16avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(u16avx, result[i] = y[i] ? x[i] / y[i] : 0); // guard zero divisor (UB otherwise), consistent with the SSE div fallbacks
+}
+
+KFR_INTRINSIC i16avx add(const i16avx& x, const i16avx& y) { return _mm256_add_epi16(x.v, y.v); }
+KFR_INTRINSIC i16avx sub(const i16avx& x, const i16avx& y) { return _mm256_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC i16avx mul(const i16avx& x, const i16avx& y) { return _mm256_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC i16avx div(const i16avx& x, const i16avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(i16avx, result[i] = y[i] ? x[i] / y[i] : 0); // guard zero divisor (UB otherwise), consistent with the SSE div fallbacks
+}
+
+KFR_INTRINSIC u32avx add(const u32avx& x, const u32avx& y) { return _mm256_add_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx sub(const u32avx& x, const u32avx& y) { return _mm256_sub_epi32(x.v, y.v); }
+
+KFR_INTRINSIC i32avx add(const i32avx& x, const i32avx& y) { return _mm256_add_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx sub(const i32avx& x, const i32avx& y) { return _mm256_sub_epi32(x.v, y.v); }
+
+KFR_INTRINSIC u32avx mul(const u32avx& x, const u32avx& y) { return _mm256_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx mul(const i32avx& x, const i32avx& y) { return _mm256_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC u32avx div(const u32avx& x, const u32avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(u32avx, result[i] = y[i] ? x[i] / y[i] : 0); // guard zero divisor (UB otherwise), consistent with the SSE div fallbacks
+}
+KFR_INTRINSIC i32avx div(const i32avx& x, const i32avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(i32avx, result[i] = y[i] ? x[i] / y[i] : 0); // guard zero divisor (UB otherwise), consistent with the SSE div fallbacks
+}
+
+// 64-bit add/sub map directly to AVX2 instructions. 64-bit multiply and
+// divide have no AVX2 instruction and are emulated per lane; div guards
+// against a zero divisor and yields 0 for that lane.
+KFR_INTRINSIC u64avx add(const u64avx& x, const u64avx& y) { return _mm256_add_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx sub(const u64avx& x, const u64avx& y) { return _mm256_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC u64avx mul(const u64avx& x, const u64avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = x[i] * y[i]);
+}
+KFR_INTRINSIC u64avx div(const u64avx& x, const u64avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(u64avx, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+KFR_INTRINSIC i64avx add(const i64avx& x, const i64avx& y) { return _mm256_add_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx sub(const i64avx& x, const i64avx& y) { return _mm256_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx mul(const i64avx& x, const i64avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] * y[i]);
+}
+KFR_INTRINSIC i64avx div(const i64avx& x, const i64avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+// AVX2 has no 8-bit multiply; synthesize it from 16-bit multiplies.
+// `even` holds products of all bytes in 16-bit lanes (low bytes valid for
+// even positions); `odd` computes products of the odd bytes shifted down.
+// The low byte of each product is then recombined into its position.
+KFR_INTRINSIC __m256i mul_epi8(const __m256i& x, const __m256i& y)
+{
+    const __m256i even = _mm256_mullo_epi16(x, y);
+    const __m256i odd = _mm256_mullo_epi16(_mm256_srli_epi16(x, 8), _mm256_srli_epi16(y, 8));
+    return _mm256_or_si256(_mm256_slli_epi16(odd, 8), _mm256_srli_epi16(_mm256_slli_epi16(even, 8), 8));
+}
+
+// Signed and unsigned byte multiply share the same bit-level result.
+KFR_INTRINSIC u8avx mul(const u8avx& x, const u8avx& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC i8avx mul(const i8avx& x, const i8avx& y) { return mul_epi8(x.v, y.v); }
+
+// Bitwise and/or/xor: lane width is irrelevant for bitwise operations, so
+// every integer element type maps to the same full-width _si256 intrinsic.
+KFR_INTRINSIC u8avx band(const u8avx& x, const u8avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC u16avx band(const u16avx& x, const u16avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC u32avx band(const u32avx& x, const u32avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC u64avx band(const u64avx& x, const u64avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC i8avx band(const i8avx& x, const i8avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC i16avx band(const i16avx& x, const i16avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC i32avx band(const i32avx& x, const i32avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC i64avx band(const i64avx& x, const i64avx& y) { return _mm256_and_si256(x.v, y.v); }
+KFR_INTRINSIC u8avx bor(const u8avx& x, const u8avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC u16avx bor(const u16avx& x, const u16avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC u32avx bor(const u32avx& x, const u32avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC u64avx bor(const u64avx& x, const u64avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC i8avx bor(const i8avx& x, const i8avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC i16avx bor(const i16avx& x, const i16avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC i32avx bor(const i32avx& x, const i32avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC i64avx bor(const i64avx& x, const i64avx& y) { return _mm256_or_si256(x.v, y.v); }
+KFR_INTRINSIC u8avx bxor(const u8avx& x, const u8avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC u16avx bxor(const u16avx& x, const u16avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC u32avx bxor(const u32avx& x, const u32avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC u64avx bxor(const u64avx& x, const u64avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC i8avx bxor(const i8avx& x, const i8avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC i16avx bxor(const i16avx& x, const i16avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC i32avx bxor(const i32avx& x, const i32avx& y) { return _mm256_xor_si256(x.v, y.v); }
+KFR_INTRINSIC i64avx bxor(const i64avx& x, const i64avx& y) { return _mm256_xor_si256(x.v, y.v); }
+
+// Shifts by a uniform immediate count. Right shifts are logical (srli)
+// for unsigned lanes and arithmetic (srai) for signed lanes.
+KFR_INTRINSIC u16avx shl(const u16avx& x, unsigned y) { return _mm256_slli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx shl(const u32avx& x, unsigned y) { return _mm256_slli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx shl(const i16avx& x, unsigned y) { return _mm256_slli_epi16(x.v, y); }
+KFR_INTRINSIC i32avx shl(const i32avx& x, unsigned y) { return _mm256_slli_epi32(x.v, y); }
+KFR_INTRINSIC u16avx shr(const u16avx& x, unsigned y) { return _mm256_srli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx shr(const u32avx& x, unsigned y) { return _mm256_srli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx shr(const i16avx& x, unsigned y) { return _mm256_srai_epi16(x.v, y); }
+KFR_INTRINSIC i32avx shr(const i32avx& x, unsigned y) { return _mm256_srai_epi32(x.v, y); }
+
+KFR_INTRINSIC u64avx shl(const u64avx& x, unsigned y) { return _mm256_slli_epi64(x.v, y); }
+KFR_INTRINSIC u64avx shr(const u64avx& x, unsigned y) { return _mm256_srli_epi64(x.v, y); }
+KFR_INTRINSIC i64avx shl(const i64avx& x, unsigned y) { return _mm256_slli_epi64(x.v, y); }
+KFR_INTRINSIC i64avx shr(const i64avx& x, unsigned y)
+{
+    // AVX2 has no 64-bit arithmetic right shift (_mm256_srai_epi64 is
+    // AVX512), so emulate per lane. Fixed: the componentwise result type
+    // was u64avx, mismatching the declared i64avx return type; every
+    // sibling componentwise shift uses the matching type.
+    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] >> y);
+}
+
+// 8-bit shifts do not exist in AVX2. Each vector is widened to 16-bit
+// lanes by unpacking against zero (bytes land in the high half, i.e. each
+// lane holds x[i] << 8), shifted in 16-bit lanes, then narrowed back with
+// _mm256_packs_epi16.
+// NOTE(review): packs_epi16 performs *signed saturation*, not truncation —
+// confirm the widened-lane arithmetic produces the intended modular shift
+// semantics for all byte values.
+KFR_INTRINSIC u8avx shl(const u8avx& x, unsigned y)
+{
+    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
+    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
+    __m256i ll = _mm256_slli_epi16(l, y);
+    __m256i hh = _mm256_slli_epi16(h, y);
+
+    return _mm256_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx shl(const i8avx& x, unsigned y)
+{
+    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
+    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
+    __m256i ll = _mm256_slli_epi16(l, y);
+    __m256i hh = _mm256_slli_epi16(h, y);
+
+    return _mm256_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC u8avx shr(const u8avx& x, unsigned y)
+{
+    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
+    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
+    __m256i ll = _mm256_srli_epi16(l, y);
+    __m256i hh = _mm256_srli_epi16(h, y);
+
+    return _mm256_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx shr(const i8avx& x, unsigned y)
+{
+    // Arithmetic (sign-propagating) shift for the signed variant.
+    __m256i l = _mm256_unpacklo_epi8(_mm256_setzero_si256(), x.v);
+    __m256i h = _mm256_unpackhi_epi8(_mm256_setzero_si256(), x.v);
+    __m256i ll = _mm256_srai_epi16(l, y);
+    __m256i hh = _mm256_srai_epi16(h, y);
+
+    return _mm256_packs_epi16(ll, hh);
+}
+
+// Per-lane (variable-count) shifts using the AVX2 vector-shift
+// instructions, available in both 128-bit and 256-bit forms.
+KFR_INTRINSIC u32sse shl(const u32sse& x, const u32sse& y) { return _mm_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32sse shl(const i32sse& x, const u32sse& y) { return _mm_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC u64sse shl(const u64sse& x, const u64sse& y) { return _mm_sllv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64sse shl(const i64sse& x, const u64sse& y) { return _mm_sllv_epi64(x.v, y.v); }
+
+KFR_INTRINSIC u32avx shl(const u32avx& x, const u32avx& y) { return _mm256_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx shl(const i32avx& x, const u32avx& y) { return _mm256_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx shl(const u64avx& x, const u64avx& y) { return _mm256_sllv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx shl(const i64avx& x, const u64avx& y) { return _mm256_sllv_epi64(x.v, y.v); }
+
+// Right shifts: logical (srlv) for unsigned, arithmetic (srav) for signed.
+// There is no srav_epi64 below AVX512, so the signed 64-bit right shift is
+// emulated componentwise.
+KFR_INTRINSIC u32sse shr(const u32sse& x, const u32sse& y) { return _mm_srlv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32sse shr(const i32sse& x, const u32sse& y) { return _mm_srav_epi32(x.v, y.v); }
+KFR_INTRINSIC u64sse shr(const u64sse& x, const u64sse& y) { return _mm_srlv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64sse shr(const i64sse& x, const u64sse& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64sse, result[i] = x[i] >> y[i]);
+}
+
+KFR_INTRINSIC u32avx shr(const u32avx& x, const u32avx& y) { return _mm256_srlv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx shr(const i32avx& x, const u32avx& y) { return _mm256_srav_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx shr(const u64avx& x, const u64avx& y) { return _mm256_srlv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx shr(const i64avx& x, const u64avx& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx, result[i] = x[i] >> y[i]);
+}
+
+// Shifts applied to the raw bit patterns of float vectors: bit-cast to
+// integer, perform a logical shift, bit-cast back. The casts are free
+// (reinterpretation only, no conversion).
+KFR_INTRINSIC f32sse shl(const f32sse& x, const u32sse& y)
+{
+    return _mm_castsi128_ps(_mm_sllv_epi32(_mm_castps_si128(x.v), y.v));
+}
+KFR_INTRINSIC f64sse shl(const f64sse& x, const u64sse& y)
+{
+    return _mm_castsi128_pd(_mm_sllv_epi64(_mm_castpd_si128(x.v), y.v));
+}
+KFR_INTRINSIC f32sse shr(const f32sse& x, const u32sse& y)
+{
+    return _mm_castsi128_ps(_mm_srlv_epi32(_mm_castps_si128(x.v), y.v));
+}
+KFR_INTRINSIC f64sse shr(const f64sse& x, const u64sse& y)
+{
+    return _mm_castsi128_pd(_mm_srlv_epi64(_mm_castpd_si128(x.v), y.v));
+}
+
+KFR_INTRINSIC f32avx shl(const f32avx& x, const u32avx& y)
+{
+    return _mm256_castsi256_ps(_mm256_sllv_epi32(_mm256_castps_si256(x.v), y.v));
+}
+KFR_INTRINSIC f64avx shl(const f64avx& x, const u64avx& y)
+{
+    return _mm256_castsi256_pd(_mm256_sllv_epi64(_mm256_castpd_si256(x.v), y.v));
+}
+KFR_INTRINSIC f32avx shr(const f32avx& x, const u32avx& y)
+{
+    return _mm256_castsi256_ps(_mm256_srlv_epi32(_mm256_castps_si256(x.v), y.v));
+}
+KFR_INTRINSIC f64avx shr(const f64avx& x, const u64avx& y)
+{
+    return _mm256_castsi256_pd(_mm256_srlv_epi64(_mm256_castpd_si256(x.v), y.v));
+}
+
+// Integer comparisons producing full-width lane masks (all ones on true,
+// all zeros on false). Equality is sign-agnostic.
+KFR_INTRINSIC i8avx eq(const i8avx& x, const i8avx& y) { return _mm256_cmpeq_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx eq(const i16avx& x, const i16avx& y) { return _mm256_cmpeq_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx eq(const i32avx& x, const i32avx& y) { return _mm256_cmpeq_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx eq(const i64avx& x, const i64avx& y) { return _mm256_cmpeq_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx eq(const u8avx& x, const u8avx& y) { return _mm256_cmpeq_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx eq(const u16avx& x, const u16avx& y) { return _mm256_cmpeq_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx eq(const u32avx& x, const u32avx& y) { return _mm256_cmpeq_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx eq(const u64avx& x, const u64avx& y) { return _mm256_cmpeq_epi64(x.v, y.v); }
+
+// ne = NOT(eq); AVX2 has no cmpneq instruction.
+KFR_INTRINSIC i8avx ne(const i8avx& x, const i8avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi8(x.v, y.v));
+}
+KFR_INTRINSIC i16avx ne(const i16avx& x, const i16avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi16(x.v, y.v));
+}
+KFR_INTRINSIC i32avx ne(const i32avx& x, const i32avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi32(x.v, y.v));
+}
+KFR_INTRINSIC i64avx ne(const i64avx& x, const i64avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi64(x.v, y.v));
+}
+KFR_INTRINSIC u8avx ne(const u8avx& x, const u8avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi8(x.v, y.v));
+}
+KFR_INTRINSIC u16avx ne(const u16avx& x, const u16avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi16(x.v, y.v));
+}
+KFR_INTRINSIC u32avx ne(const u32avx& x, const u32avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi32(x.v, y.v));
+}
+KFR_INTRINSIC u64avx ne(const u64avx& x, const u64avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpeq_epi64(x.v, y.v));
+}
+
+// Signed ordering: AVX2 only provides cmpgt, so x < y is written y > x.
+KFR_INTRINSIC i8avx lt(const i8avx& x, const i8avx& y) { return _mm256_cmpgt_epi8(y.v, x.v); }
+KFR_INTRINSIC i16avx lt(const i16avx& x, const i16avx& y) { return _mm256_cmpgt_epi16(y.v, x.v); }
+KFR_INTRINSIC i32avx lt(const i32avx& x, const i32avx& y) { return _mm256_cmpgt_epi32(y.v, x.v); }
+KFR_INTRINSIC i64avx lt(const i64avx& x, const i64avx& y) { return _mm256_cmpgt_epi64(y.v, x.v); }
+
+KFR_INTRINSIC i8avx gt(const i8avx& x, const i8avx& y) { return _mm256_cmpgt_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx gt(const i16avx& x, const i16avx& y) { return _mm256_cmpgt_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx gt(const i32avx& x, const i32avx& y) { return _mm256_cmpgt_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx gt(const i64avx& x, const i64avx& y) { return _mm256_cmpgt_epi64(x.v, y.v); }
+
+// Signed le/ge: complement of the strict comparison,
+// le(x, y) == NOT(x > y) and ge(x, y) == NOT(y > x).
+KFR_INTRINSIC i8avx le(const i8avx& x, const i8avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi8(x.v, y.v));
+}
+KFR_INTRINSIC i16avx le(const i16avx& x, const i16avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi16(x.v, y.v));
+}
+KFR_INTRINSIC i32avx le(const i32avx& x, const i32avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi32(x.v, y.v));
+}
+KFR_INTRINSIC i64avx le(const i64avx& x, const i64avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi64(x.v, y.v));
+}
+
+KFR_INTRINSIC i8avx ge(const i8avx& x, const i8avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi8(y.v, x.v));
+}
+KFR_INTRINSIC i16avx ge(const i16avx& x, const i16avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi16(y.v, x.v));
+}
+KFR_INTRINSIC i32avx ge(const i32avx& x, const i32avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi32(y.v, x.v));
+}
+KFR_INTRINSIC i64avx ge(const i64avx& x, const i64avx& y)
+{
+    return _mm256_not_si256(_mm256_cmpgt_epi64(y.v, x.v));
+}
+
+// Unsigned comparisons: AVX2 only has *signed* cmpgt, so both operands are
+// biased by the sign bit (adding 0x80... flips the top bit), which maps
+// unsigned order onto signed order. le/ge are complements of strict forms.
+KFR_INTRINSIC u8avx lt(const u8avx& x, const u8avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi8();
+    return _mm256_cmpgt_epi8(_mm256_add_epi8(y.v, hb), _mm256_add_epi8(x.v, hb));
+}
+KFR_INTRINSIC u16avx lt(const u16avx& x, const u16avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi16();
+    return _mm256_cmpgt_epi16(_mm256_add_epi16(y.v, hb), _mm256_add_epi16(x.v, hb));
+}
+KFR_INTRINSIC u32avx lt(const u32avx& x, const u32avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi32();
+    return _mm256_cmpgt_epi32(_mm256_add_epi32(y.v, hb), _mm256_add_epi32(x.v, hb));
+}
+KFR_INTRINSIC u64avx lt(const u64avx& x, const u64avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi64();
+    return _mm256_cmpgt_epi64(_mm256_add_epi64(y.v, hb), _mm256_add_epi64(x.v, hb));
+}
+KFR_INTRINSIC u8avx gt(const u8avx& x, const u8avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi8();
+    return _mm256_cmpgt_epi8(_mm256_add_epi8(x.v, hb), _mm256_add_epi8(y.v, hb));
+}
+KFR_INTRINSIC u16avx gt(const u16avx& x, const u16avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi16();
+    return _mm256_cmpgt_epi16(_mm256_add_epi16(x.v, hb), _mm256_add_epi16(y.v, hb));
+}
+KFR_INTRINSIC u32avx gt(const u32avx& x, const u32avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi32();
+    return _mm256_cmpgt_epi32(_mm256_add_epi32(x.v, hb), _mm256_add_epi32(y.v, hb));
+}
+KFR_INTRINSIC u64avx gt(const u64avx& x, const u64avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi64();
+    return _mm256_cmpgt_epi64(_mm256_add_epi64(x.v, hb), _mm256_add_epi64(y.v, hb));
+}
+KFR_INTRINSIC u8avx le(const u8avx& x, const u8avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi8();
+    return _mm256_not_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(x.v, hb), _mm256_add_epi8(y.v, hb)));
+}
+KFR_INTRINSIC u16avx le(const u16avx& x, const u16avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi16();
+    return _mm256_not_si256(_mm256_cmpgt_epi16(_mm256_add_epi16(x.v, hb), _mm256_add_epi16(y.v, hb)));
+}
+KFR_INTRINSIC u32avx le(const u32avx& x, const u32avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi32();
+    return _mm256_not_si256(_mm256_cmpgt_epi32(_mm256_add_epi32(x.v, hb), _mm256_add_epi32(y.v, hb)));
+}
+KFR_INTRINSIC u64avx le(const u64avx& x, const u64avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi64();
+    return _mm256_not_si256(_mm256_cmpgt_epi64(_mm256_add_epi64(x.v, hb), _mm256_add_epi64(y.v, hb)));
+}
+KFR_INTRINSIC u8avx ge(const u8avx& x, const u8avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi8();
+    return _mm256_not_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(y.v, hb), _mm256_add_epi8(x.v, hb)));
+}
+KFR_INTRINSIC u16avx ge(const u16avx& x, const u16avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi16();
+    return _mm256_not_si256(_mm256_cmpgt_epi16(_mm256_add_epi16(y.v, hb), _mm256_add_epi16(x.v, hb)));
+}
+KFR_INTRINSIC u32avx ge(const u32avx& x, const u32avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi32();
+    return _mm256_not_si256(_mm256_cmpgt_epi32(_mm256_add_epi32(y.v, hb), _mm256_add_epi32(x.v, hb)));
+}
+KFR_INTRINSIC u64avx ge(const u64avx& x, const u64avx& y)
+{
+    const __m256i hb = _mm256_highbit_epi64();
+    return _mm256_not_si256(_mm256_cmpgt_epi64(_mm256_add_epi64(y.v, hb), _mm256_add_epi64(x.v, hb)));
+}
+
+#if defined CMT_ARCH_AVX512
+// AVX512 single/double arithmetic maps 1:1 to 512-bit instructions.
+KFR_INTRINSIC f32avx512 add(const f32avx512& x, const f32avx512& y) { return _mm512_add_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 add(const f64avx512& x, const f64avx512& y) { return _mm512_add_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 sub(const f32avx512& x, const f32avx512& y) { return _mm512_sub_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 sub(const f64avx512& x, const f64avx512& y) { return _mm512_sub_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 mul(const f32avx512& x, const f32avx512& y) { return _mm512_mul_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 mul(const f64avx512& x, const f64avx512& y) { return _mm512_mul_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 div(const f32avx512& x, const f32avx512& y) { return _mm512_div_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 div(const f64avx512& x, const f64avx512& y) { return _mm512_div_pd(x.v, y.v); }
+
+// AVX512 helpers missing from the intrinsic set: an all-ones constant is
+// produced with vpternlogd and truth table 0xFF (always 1), which avoids
+// loading a constant from memory; NOT is xor with all-ones; the highbit
+// constants (sign bit per lane) support the unsigned-comparison bias.
+KFR_INTRINSIC __m512 _mm512_allones_ps()
+{
+    return _mm512_castsi512_ps(_mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(),
+                                                         _mm512_setzero_si512(), 0xFF));
+}
+
+KFR_INTRINSIC __m512d _mm512_allones_pd()
+{
+    return _mm512_castsi512_pd(_mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(),
+                                                         _mm512_setzero_si512(), 0xFF));
+}
+
+KFR_INTRINSIC __m512i _mm512_allones_si512()
+{
+    return _mm512_ternarylogic_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(),
+                                     0xFF);
+}
+
+KFR_INTRINSIC __m512 _mm512_not_ps(const __m512& x) { return _mm512_xor_ps(x, _mm512_allones_ps()); }
+KFR_INTRINSIC __m512d _mm512_not_pd(const __m512d& x) { return _mm512_xor_pd(x, _mm512_allones_pd()); }
+KFR_INTRINSIC __m512i _mm512_not_si512(const __m512i& x)
+{
+    return _mm512_xor_si512(x, _mm512_allones_si512());
+}
+
+KFR_INTRINSIC __m512i _mm512_highbit_epi8() { return _mm512_set1_epi8(static_cast<char>(0x80)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi16() { return _mm512_set1_epi16(static_cast<short>(0x8000)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi32() { return _mm512_set1_epi32(static_cast<int>(0x80000000)); }
+KFR_INTRINSIC __m512i _mm512_highbit_epi64() { return _mm512_set1_epi64(0x8000000000000000ll); }
+
+// AVX512 float comparisons: compare into a k-mask register, then expand it
+// to a full-width lane mask with movm so the result matches the AVX/SSE
+// mask representation used elsewhere.
+// NOTE(review): ne uses _CMP_NEQ_OQ (ordered), so NaN != NaN yields false
+// here, unlike C++ operator!= — confirm this matches the other backends.
+KFR_INTRINSIC f32avx512 eq(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_EQ_OQ)));
+}
+KFR_INTRINSIC f64avx512 eq(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_EQ_OQ)));
+}
+KFR_INTRINSIC f32avx512 ne(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_NEQ_OQ)));
+}
+KFR_INTRINSIC f64avx512 ne(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_NEQ_OQ)));
+}
+KFR_INTRINSIC f32avx512 lt(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_LT_OQ)));
+}
+KFR_INTRINSIC f64avx512 lt(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_LT_OQ)));
+}
+KFR_INTRINSIC f32avx512 gt(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_GT_OQ)));
+}
+KFR_INTRINSIC f64avx512 gt(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_GT_OQ)));
+}
+KFR_INTRINSIC f32avx512 le(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_LE_OQ)));
+}
+KFR_INTRINSIC f64avx512 le(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_LE_OQ)));
+}
+KFR_INTRINSIC f32avx512 ge(const f32avx512& x, const f32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(x.v, y.v, _CMP_GE_OQ)));
+}
+KFR_INTRINSIC f64avx512 ge(const f64avx512& x, const f64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_movm_epi64(_mm512_cmp_pd_mask(x.v, y.v, _CMP_GE_OQ)));
+}
+
+// Bitwise operations on float bit patterns (AVX512DQ forms).
+KFR_INTRINSIC f32avx512 band(const f32avx512& x, const f32avx512& y) { return _mm512_and_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 band(const f64avx512& x, const f64avx512& y) { return _mm512_and_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 bor(const f32avx512& x, const f32avx512& y) { return _mm512_or_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 bor(const f64avx512& x, const f64avx512& y) { return _mm512_or_pd(x.v, y.v); }
+KFR_INTRINSIC f32avx512 bxor(const f32avx512& x, const f32avx512& y) { return _mm512_xor_ps(x.v, y.v); }
+KFR_INTRINSIC f64avx512 bxor(const f64avx512& x, const f64avx512& y) { return _mm512_xor_pd(x.v, y.v); }
+
+// k-mask complement. The scalar-complement branch is kept enabled;
+// presumably the _knot_mask* intrinsics are missing or broken in some
+// supported compilers' headers — confirm before switching to the #else
+// branch.
+#if 1
+#define KFR_knot_mask8(x) ((__mmask8)(~((u8)(x))))
+#define KFR_knot_mask16(x) ((__mmask16)(~((u16)(x))))
+#define KFR_knot_mask32(x) ((__mmask32)(~((u32)(x))))
+#define KFR_knot_mask64(x) ((__mmask64)(~((u64)(x))))
+#else
+#define KFR_knot_mask8(x) _knot_mask8(x)
+#define KFR_knot_mask16(x) _knot_mask16(x)
+#define KFR_knot_mask32(x) _knot_mask32(x)
+#define KFR_knot_mask64(x) _knot_mask64(x)
+#endif
+
+// AVX512 signed integer comparisons: compare into a k-mask, expand with
+// movm. The mask width matches the lane count (64 masks for epi8, 32 for
+// epi16, 16 for epi32, 8 for epi64), hence the different KFR_knot_mask*
+// widths. Derived forms: ne = NOT(eq), ge = NOT(lt), le = NOT(y < x),
+// gt = (y < x).
+KFR_INTRINSIC i8avx512 eq(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(x.v, y.v));
+}
+KFR_INTRINSIC i16avx512 eq(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmpeq_epi16_mask(x.v, y.v));
+}
+KFR_INTRINSIC i32avx512 eq(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(x.v, y.v));
+}
+KFR_INTRINSIC i64avx512 eq(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmpeq_epi64_mask(x.v, y.v));
+}
+KFR_INTRINSIC i8avx512 ne(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmpeq_epi8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i16avx512 ne(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmpeq_epi16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i32avx512 ne(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmpeq_epi32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i64avx512 ne(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmpeq_epi64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i8avx512 ge(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epi8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i16avx512 ge(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epi16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i32avx512 ge(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epi32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i64avx512 ge(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epi64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC i8avx512 lt(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(x.v, y.v));
+}
+KFR_INTRINSIC i16avx512 lt(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(x.v, y.v));
+}
+KFR_INTRINSIC i32avx512 lt(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epi32_mask(x.v, y.v));
+}
+KFR_INTRINSIC i64avx512 lt(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epi64_mask(x.v, y.v));
+}
+KFR_INTRINSIC i8avx512 le(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epi8_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i16avx512 le(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epi16_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i32avx512 le(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epi32_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i64avx512 le(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epi64_mask(y.v, x.v)));
+}
+KFR_INTRINSIC i8avx512 gt(const i8avx512& x, const i8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epi8_mask(y.v, x.v));
+}
+KFR_INTRINSIC i16avx512 gt(const i16avx512& x, const i16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epi16_mask(y.v, x.v));
+}
+KFR_INTRINSIC i32avx512 gt(const i32avx512& x, const i32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epi32_mask(y.v, x.v));
+}
+KFR_INTRINSIC i64avx512 gt(const i64avx512& x, const i64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epi64_mask(y.v, x.v));
+}
+
+// AVX512 unsigned comparisons: same k-mask + movm scheme as the signed
+// overloads, but using the native *_epu*_mask instructions — no sign-bit
+// bias is needed here, unlike AVX2.
+KFR_INTRINSIC u8avx512 eq(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmpeq_epu8_mask(x.v, y.v));
+}
+KFR_INTRINSIC u16avx512 eq(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmpeq_epu16_mask(x.v, y.v));
+}
+KFR_INTRINSIC u32avx512 eq(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmpeq_epu32_mask(x.v, y.v));
+}
+KFR_INTRINSIC u64avx512 eq(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmpeq_epu64_mask(x.v, y.v));
+}
+KFR_INTRINSIC u8avx512 ne(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmpeq_epu8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u16avx512 ne(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmpeq_epu16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u32avx512 ne(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmpeq_epu32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u64avx512 ne(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmpeq_epu64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u8avx512 ge(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epu8_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u16avx512 ge(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epu16_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u32avx512 ge(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epu32_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u64avx512 ge(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epu64_mask(x.v, y.v)));
+}
+KFR_INTRINSIC u8avx512 lt(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(x.v, y.v));
+}
+KFR_INTRINSIC u16avx512 lt(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(x.v, y.v));
+}
+KFR_INTRINSIC u32avx512 lt(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epu32_mask(x.v, y.v));
+}
+KFR_INTRINSIC u64avx512 lt(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epu64_mask(x.v, y.v));
+}
+KFR_INTRINSIC u8avx512 le(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(KFR_knot_mask64(_mm512_cmplt_epu8_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u16avx512 le(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(KFR_knot_mask32(_mm512_cmplt_epu16_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u32avx512 le(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(KFR_knot_mask16(_mm512_cmplt_epu32_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u64avx512 le(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(KFR_knot_mask8(_mm512_cmplt_epu64_mask(y.v, x.v)));
+}
+KFR_INTRINSIC u8avx512 gt(const u8avx512& x, const u8avx512& y)
+{
+    return _mm512_movm_epi8(_mm512_cmplt_epu8_mask(y.v, x.v));
+}
+KFR_INTRINSIC u16avx512 gt(const u16avx512& x, const u16avx512& y)
+{
+    return _mm512_movm_epi16(_mm512_cmplt_epu16_mask(y.v, x.v));
+}
+KFR_INTRINSIC u32avx512 gt(const u32avx512& x, const u32avx512& y)
+{
+    return _mm512_movm_epi32(_mm512_cmplt_epu32_mask(y.v, x.v));
+}
+KFR_INTRINSIC u64avx512 gt(const u64avx512& x, const u64avx512& y)
+{
+    return _mm512_movm_epi64(_mm512_cmplt_epu64_mask(y.v, x.v));
+}
+
+// AVX512 integer add/sub map directly to instructions; signedness does not
+// affect two's-complement add/sub, so signed and unsigned share opcodes.
+KFR_INTRINSIC i8avx512 add(const i8avx512& x, const i8avx512& y) { return _mm512_add_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 add(const i16avx512& x, const i16avx512& y) { return _mm512_add_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 add(const i32avx512& x, const i32avx512& y) { return _mm512_add_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 add(const i64avx512& x, const i64avx512& y) { return _mm512_add_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 add(const u8avx512& x, const u8avx512& y) { return _mm512_add_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 add(const u16avx512& x, const u16avx512& y) { return _mm512_add_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 add(const u32avx512& x, const u32avx512& y) { return _mm512_add_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 add(const u64avx512& x, const u64avx512& y) { return _mm512_add_epi64(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 sub(const i8avx512& x, const i8avx512& y) { return _mm512_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 sub(const i16avx512& x, const i16avx512& y) { return _mm512_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 sub(const i32avx512& x, const i32avx512& y) { return _mm512_sub_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 sub(const i64avx512& x, const i64avx512& y) { return _mm512_sub_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 sub(const u8avx512& x, const u8avx512& y) { return _mm512_sub_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 sub(const u16avx512& x, const u16avx512& y) { return _mm512_sub_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 sub(const u32avx512& x, const u32avx512& y) { return _mm512_sub_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 sub(const u64avx512& x, const u64avx512& y) { return _mm512_sub_epi64(x.v, y.v); }
+
+// 8-bit multiply synthesized from 16-bit multiplies (even/odd byte
+// products recombined), same construction as the AVX2 mul_epi8 above.
+KFR_INTRINSIC __m512i mul_epi8(const __m512i& x, const __m512i& y)
+{
+    const __m512i even = _mm512_mullo_epi16(x, y);
+    const __m512i odd = _mm512_mullo_epi16(_mm512_srli_epi16(x, 8), _mm512_srli_epi16(y, 8));
+    return _mm512_or_si512(_mm512_slli_epi16(odd, 8), _mm512_srli_epi16(_mm512_slli_epi16(even, 8), 8));
+}
+
+// NOTE(review): _mm512_mullo_epi64 requires AVX512DQ — confirm the build
+// guards for this translation unit imply DQ support.
+KFR_INTRINSIC i8avx512 mul(const i8avx512& x, const i8avx512& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC i16avx512 mul(const i16avx512& x, const i16avx512& y) { return _mm512_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC i32avx512 mul(const i32avx512& x, const i32avx512& y) { return _mm512_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC i64avx512 mul(const i64avx512& x, const i64avx512& y) { return _mm512_mullo_epi64(x.v, y.v); }
+KFR_INTRINSIC u8avx512 mul(const u8avx512& x, const u8avx512& y) { return mul_epi8(x.v, y.v); }
+KFR_INTRINSIC u16avx512 mul(const u16avx512& x, const u16avx512& y) { return _mm512_mullo_epi16(x.v, y.v); }
+KFR_INTRINSIC u32avx512 mul(const u32avx512& x, const u32avx512& y) { return _mm512_mullo_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 mul(const u64avx512& x, const u64avx512& y) { return _mm512_mullo_epi64(x.v, y.v); }
+
+// Componentwise division with a zero-divisor guard (lane result 0).
+// Fixed: the signed overloads previously passed the *unsigned* vector
+// types (u8avx512 etc.) as the KFR_COMPONENTWISE_RET_I result type,
+// mismatching their declared signed return types; they now use the
+// matching signed types, consistent with the AVX overloads above.
+KFR_INTRINSIC i8avx512 div(const i8avx512& x, const i8avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i8avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i16avx512 div(const i16avx512& x, const i16avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i16avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i32avx512 div(const i32avx512& x, const i32avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i32avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC i64avx512 div(const i64avx512& x, const i64avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(i64avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u8avx512 div(const u8avx512& x, const u8avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u8avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u16avx512 div(const u16avx512& x, const u16avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u16avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u32avx512 div(const u32avx512& x, const u32avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u32avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+KFR_INTRINSIC u64avx512 div(const u64avx512& x, const u64avx512& y)
+{
+    KFR_COMPONENTWISE_RET_I(u64avx512, result[i] = y[i] ? x[i] / y[i] : 0);
+}
+
+// Bitwise and/or/xor: element width is irrelevant, so all integer types
+// share the full-width _si512 intrinsics.
+KFR_INTRINSIC i8avx512 band(const i8avx512& x, const i8avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 band(const i16avx512& x, const i16avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 band(const i32avx512& x, const i32avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 band(const i64avx512& x, const i64avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 band(const u8avx512& x, const u8avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 band(const u16avx512& x, const u16avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 band(const u32avx512& x, const u32avx512& y) { return _mm512_and_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 band(const u64avx512& x, const u64avx512& y) { return _mm512_and_si512(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 bor(const i8avx512& x, const i8avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 bor(const i16avx512& x, const i16avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 bor(const i32avx512& x, const i32avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 bor(const i64avx512& x, const i64avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 bor(const u8avx512& x, const u8avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 bor(const u16avx512& x, const u16avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 bor(const u32avx512& x, const u32avx512& y) { return _mm512_or_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 bor(const u64avx512& x, const u64avx512& y) { return _mm512_or_si512(x.v, y.v); }
+
+KFR_INTRINSIC i8avx512 bxor(const i8avx512& x, const i8avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i16avx512 bxor(const i16avx512& x, const i16avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i32avx512 bxor(const i32avx512& x, const i32avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC i64avx512 bxor(const i64avx512& x, const i64avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u8avx512 bxor(const u8avx512& x, const u8avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u16avx512 bxor(const u16avx512& x, const u16avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u32avx512 bxor(const u32avx512& x, const u32avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+KFR_INTRINSIC u64avx512 bxor(const u64avx512& x, const u64avx512& y) { return _mm512_xor_si512(x.v, y.v); }
+
+// Shifts by a uniform (runtime) count. Float overloads shift the raw IEEE-754
+// bit pattern: the register is reinterpreted as integers, shifted, and
+// reinterpreted back — no numeric scaling is implied.
+KFR_INTRINSIC f32avx512 shl(const f32avx512& x, unsigned y)
+{
+    return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(x.v), y));
+}
+KFR_INTRINSIC f64avx512 shl(const f64avx512& x, unsigned y)
+{
+    return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(x.v), y));
+}
+KFR_INTRINSIC f32avx512 shr(const f32avx512& x, unsigned y)
+{
+    return _mm512_castsi512_ps(_mm512_srli_epi32(_mm512_castps_si512(x.v), y));
+}
+KFR_INTRINSIC f64avx512 shr(const f64avx512& x, unsigned y)
+{
+    return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(x.v), y));
+}
+
+// Integer shifts: logical (srli) for unsigned right shift, arithmetic
+// (srai, sign-extending) for signed right shift.
+KFR_INTRINSIC u16avx512 shl(const u16avx512& x, unsigned y) { return _mm512_slli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx512 shl(const u32avx512& x, unsigned y) { return _mm512_slli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx512 shl(const i16avx512& x, unsigned y) { return _mm512_slli_epi16(x.v, y); }
+KFR_INTRINSIC i32avx512 shl(const i32avx512& x, unsigned y) { return _mm512_slli_epi32(x.v, y); }
+KFR_INTRINSIC u16avx512 shr(const u16avx512& x, unsigned y) { return _mm512_srli_epi16(x.v, y); }
+KFR_INTRINSIC u32avx512 shr(const u32avx512& x, unsigned y) { return _mm512_srli_epi32(x.v, y); }
+KFR_INTRINSIC i16avx512 shr(const i16avx512& x, unsigned y) { return _mm512_srai_epi16(x.v, y); }
+KFR_INTRINSIC i32avx512 shr(const i32avx512& x, unsigned y) { return _mm512_srai_epi32(x.v, y); }
+
+KFR_INTRINSIC u64avx512 shl(const u64avx512& x, unsigned y) { return _mm512_slli_epi64(x.v, y); }
+KFR_INTRINSIC u64avx512 shr(const u64avx512& x, unsigned y) { return _mm512_srli_epi64(x.v, y); }
+KFR_INTRINSIC i64avx512 shl(const i64avx512& x, unsigned y) { return _mm512_slli_epi64(x.v, y); }
+// NOTE(review): signed 64-bit right shift falls back to a scalar loop;
+// AVX-512F provides _mm512_srai_epi64 (or _mm512_sra_epi64 for a runtime
+// count) which could replace this — confirm against supported compilers.
+// The unsigned result type preserves the two's-complement bit pattern.
+KFR_INTRINSIC i64avx512 shr(const i64avx512& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET_I(u64avx512, result[i] = x[i] >> y);
+}
+
+// 8-bit shifts: x86 has no byte-granular shift instructions, so each byte is
+// widened into the HIGH half of a 16-bit lane (zeros interleaved into the low
+// byte), shifted as 16-bit, then packed back down.
+// NOTE(review): _mm512_packs_epi16 packs with SIGNED saturation and the
+// shifted value is left sitting in the high byte of each 16-bit lane — there
+// is no shift back down before the pack. Verify against a scalar reference
+// that this reproduces plain C shifts for all counts (also note unpack/pack
+// operate per 128-bit lane on AVX-512, which must cancel out for ordering).
+KFR_INTRINSIC u8avx512 shl(const u8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_slli_epi16(l, y);
+    __m512i hh = _mm512_slli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx512 shl(const i8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_slli_epi16(l, y);
+    __m512i hh = _mm512_slli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+// Right shifts: logical (srli) for unsigned, arithmetic (srai) for signed —
+// the byte lives in the high half so shifted-in bits land in the low byte.
+KFR_INTRINSIC u8avx512 shr(const u8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_srli_epi16(l, y);
+    __m512i hh = _mm512_srli_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+KFR_INTRINSIC i8avx512 shr(const i8avx512& x, unsigned y)
+{
+    __m512i l = _mm512_unpacklo_epi8(_mm512_setzero_si512(), x.v);
+    __m512i h = _mm512_unpackhi_epi8(_mm512_setzero_si512(), x.v);
+    __m512i ll = _mm512_srai_epi16(l, y);
+    __m512i hh = _mm512_srai_epi16(h, y);
+
+    return _mm512_packs_epi16(ll, hh);
+}
+
+// Per-element (variable) shift counts: each lane is shifted by the
+// corresponding lane of y (sllv/srlv = logical, srav = arithmetic).
+KFR_INTRINSIC u32avx512 shl(const u32avx512& x, const u32avx512& y) { return _mm512_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx512 shl(const i32avx512& x, const u32avx512& y) { return _mm512_sllv_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 shl(const u64avx512& x, const u64avx512& y) { return _mm512_sllv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 shl(const i64avx512& x, const u64avx512& y) { return _mm512_sllv_epi64(x.v, y.v); }
+
+KFR_INTRINSIC u32avx512 shr(const u32avx512& x, const u32avx512& y) { return _mm512_srlv_epi32(x.v, y.v); }
+KFR_INTRINSIC i32avx512 shr(const i32avx512& x, const u32avx512& y) { return _mm512_srav_epi32(x.v, y.v); }
+KFR_INTRINSIC u64avx512 shr(const u64avx512& x, const u64avx512& y) { return _mm512_srlv_epi64(x.v, y.v); }
+KFR_INTRINSIC i64avx512 shr(const i64avx512& x, const u64avx512& y) { return _mm512_srav_epi64(x.v, y.v); }
+
+// Float variants again shift the raw bit pattern (see the scalar-count
+// overloads above), here with per-element counts.
+KFR_INTRINSIC f32avx512 shl(const f32avx512& x, const u32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_sllv_epi32(_mm512_castps_si512(x.v), y.v));
+}
+KFR_INTRINSIC f64avx512 shl(const f64avx512& x, const u64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(x.v), y.v));
+}
+KFR_INTRINSIC f32avx512 shr(const f32avx512& x, const u32avx512& y)
+{
+    return _mm512_castsi512_ps(_mm512_srlv_epi32(_mm512_castps_si512(x.v), y.v));
+}
+KFR_INTRINSIC f64avx512 shr(const f64avx512& x, const u64avx512& y)
+{
+    return _mm512_castsi512_pd(_mm512_srlv_epi64(_mm512_castpd_si512(x.v), y.v));
+}
+
+#endif
+
+#endif
+
+#endif
+
+// Dispatch for shift-by-uniform-count (vec, unsigned) overloads at non-native
+// widths: vectors narrower than the machine width are widened, processed, and
+// sliced back; wider vectors are split into halves and recombined.
+// (Comments must stay outside the #define: '//' inside a backslash-continued
+// macro body would swallow the continuation.)
+#define KFR_HANDLE_ALL_SIZES_SHIFT_2(fn)                                                                     \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const unsigned b)                                         \
+    {                                                                                                        \
+        return slice<0, N>(fn(expand_simd(a), b));                                                           \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const unsigned b)                                         \
+    {                                                                                                        \
+        return concat(fn(low(a), b), fn(high(a), b));                                                        \
+    }
+// Same dispatch for shift-by-vector-count (vec, vec<utype>) overloads.
+#define KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(fn)                                                                 \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<utype<T>, N>& b)                                \
+    {                                                                                                        \
+        return slice<0, N>(fn(expand_simd(a), expand_simd(b)));                                              \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<utype<T>, N>& b)                                \
+    {                                                                                                        \
+        return concat(fn(low(a), low(b)), fn(high(a), high(b)));                                             \
+    }
+
+// Instantiate the non-native-width dispatchers for every binary operation
+// that has native-width intrinsic overloads above.
+// Arithmetic:
+KFR_HANDLE_ALL_SIZES_2(add)
+KFR_HANDLE_ALL_SIZES_2(sub)
+KFR_HANDLE_ALL_SIZES_2(mul)
+KFR_HANDLE_ALL_SIZES_2(div)
+
+// Comparisons (return mask vectors):
+KFR_HANDLE_ALL_SIZES_2(eq)
+KFR_HANDLE_ALL_SIZES_2(ne)
+KFR_HANDLE_ALL_SIZES_2(lt)
+KFR_HANDLE_ALL_SIZES_2(gt)
+KFR_HANDLE_ALL_SIZES_2(le)
+KFR_HANDLE_ALL_SIZES_2(ge)
+
+// Bitwise:
+KFR_HANDLE_ALL_SIZES_2(band)
+KFR_HANDLE_ALL_SIZES_2(bor)
+KFR_HANDLE_ALL_SIZES_2(bxor)
+
+// Shifts use dedicated macros because the second operand is a shift count
+// (uniform unsigned, or per-element vec<utype<T>, N>) rather than vec<T, N>.
+KFR_HANDLE_ALL_SIZES_SHIFT_2(shl)
+KFR_HANDLE_ALL_SIZES_SHIFT_2(shr)
+KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(shl)
+KFR_HANDLE_ALL_SIZES_SHIFT_VAR_2(shr)
+
+#else
+
+// Portable fallback path (no compiler vector extensions / no x86 intrinsics):
+// every operation is a scalar loop over the elements via KFR_COMPONENTWISE_RET.
+//
+// Shifts operate on the unsigned integer bit pattern of T (uibitcast), so the
+// same code covers floats and signed integers without UB from shifting
+// negative values; the result is bitcast back to T.
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shl(const vec<T, N>& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) << y)));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, const vec<utype<T>, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y[i])));
+}
+template <typename T, size_t N, typename = decltype(uibitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> shr(const vec<T, N>& x, unsigned y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<uitype<T>>(uibitcast(x[i]) >> y)));
+}
+
+// Comparisons return mask vectors: maskbits<T> turns the bool into the
+// all-ones / all-zeros bit pattern of T, matching the SIMD convention.
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> eq(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] == y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ne(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] != y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> ge(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] >= y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> le(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] <= y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> gt(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] > y[i]));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> lt(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = internal::maskbits<T>(x[i] < y[i]));
+}
+
+// Bitwise ops go through ubitcast so they also apply to floating-point types
+// (operating on the raw bit pattern).
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bor(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>((ubitcast(x[i]) | ubitcast(y[i])))));
+}
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> bxor(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>(ubitcast(x[i]) ^ ubitcast(y[i]))));
+}
+template <typename T, size_t N, typename = decltype(ubitcast(T())), KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> band(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = bitcast<T>(static_cast<utype<T>>(ubitcast(x[i]) & ubitcast(y[i]))));
+}
+
+// Elementwise arithmetic. NOTE(review): unlike the intrinsic path above, this
+// div has no divisor==0 guard — signed overflow / divide-by-zero here follow
+// plain C semantics; confirm whether that asymmetry is intended.
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> add(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = x[i] + y[i]);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> sub(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = x[i] - y[i]);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> mul(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = x[i] * y[i]);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>
+KFR_INTRINSIC vec<T, N> div(const vec<T, N>& x, const vec<T, N>& y)
+{
+    KFR_COMPONENTWISE_RET(result[i] = x[i] / y[i]);
+}
+
+// Mixed vector/scalar overloads: broadcast the scalar operand to a full
+// vector and delegate to the vec/vec implementation (both argument orders).
+#define KFR_HANDLE_VEC_SCA(fn)                                                                               \
+    template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>                                   \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& x, const T& y)                                               \
+    {                                                                                                        \
+        return fn(x, vec<T, N>(y));                                                                          \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(is_simd_type<T>::value)>                                   \
+    KFR_INTRINSIC vec<T, N> fn(const T& x, const vec<T, N>& y)                                               \
+    {                                                                                                        \
+        return fn(vec<T, N>(x), y);                                                                          \
+    }
+
+KFR_HANDLE_VEC_SCA(add)
+KFR_HANDLE_VEC_SCA(sub)
+KFR_HANDLE_VEC_SCA(mul)
+KFR_HANDLE_VEC_SCA(div)
+KFR_HANDLE_VEC_SCA(band)
+KFR_HANDLE_VEC_SCA(bor)
+KFR_HANDLE_VEC_SCA(bxor)
+KFR_HANDLE_VEC_SCA(eq)
+KFR_HANDLE_VEC_SCA(ne)
+KFR_HANDLE_VEC_SCA(lt)
+KFR_HANDLE_VEC_SCA(gt)
+KFR_HANDLE_VEC_SCA(le)
+KFR_HANDLE_VEC_SCA(ge)
+
+#endif
+
+// Bitwise NOT, expressed as XOR with an all-ones constant so it reuses the
+// bxor implementation selected above (intrinsic or fallback).
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> bnot(const vec<T, N>& x)
+{
+    return bxor(special_constants<T>::allones(), x);
+}
+
+// Negation. Integers: 0 - x (two's complement). Floats: flip only the sign
+// bit via XOR with the high-bit mask — this also negates zeros/NaNs, unlike
+// 0 - x.
+template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x)
+{
+    return sub(T(0), x);
+}
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_INTRINSIC vec<T, N> neg(const vec<T, N>& x)
+{
+    return bxor(special_constants<T>::highbitmask(), x);
+}
+
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/simd/impl/function.hpp b/include/kfr/simd/impl/function.hpp
@@ -0,0 +1,295 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../../base/expression.hpp"
+#include "../shuffle.hpp"
+#include "../types.hpp"
+#include "../vec.hpp"
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+// Unary op on non-floating types: convert elements to the corresponding
+// floating type (flt_type<T>) and call the float implementation.
+#define KFR_HANDLE_NOT_F_1(fn)                                                                               \
+    template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>                                    \
+    KFR_INTRINSIC vec<flt_type<T>, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                    \
+    {                                                                                                        \
+        return intrinsics::fn(elemcast<flt_type<T>>(a));                                                     \
+    }
+
+// Pure-scalar overload: when no argument is a vec, wrap each scalar in a
+// one-element vector, run the vector implementation, and unwrap the result.
+// The result type is the common type of all arguments.
+#define KFR_HANDLE_SCALAR(fn)                                                                                \
+    template <typename T1, typename... Args, typename Tout = ::kfr::common_type<T1, Args...>,                \
+              KFR_ENABLE_IF(!or_t<is_vec<T1>, is_vec<Args>...>::value)>                                      \
+    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
+    {                                                                                                        \
+        using vecout = vec1<Tout>;                                                                           \
+        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
+    }
+
+// Like KFR_HANDLE_SCALAR, but with a caller-specified result type Tout
+// (e.g. predicates that always return bool regardless of input type).
+#define KFR_HANDLE_SCALAR_1_T(fn, Tout)                                                                      \
+    template <typename T1, typename... Args, typename T = ::kfr::common_type<T1, Args...>,                   \
+              KFR_ENABLE_IF(!or_t<is_vec<T1>, is_vec<Args>...>::value)>                                      \
+    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
+    {                                                                                                        \
+        using vecout = vec1<Tout>;                                                                           \
+        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
+    }
+
+// Counterpart enabled when at least one argument IS a vec (note the inverted
+// or_t condition relative to KFR_HANDLE_SCALAR_1_T).
+#define KFR_HANDLE_ARGS_T(fn, Tout)                                                                          \
+    template <typename T1, typename... Args, typename T = ::kfr::common_type<T1, Args...>,                   \
+              KFR_ENABLE_IF(or_t<is_vec<T1>, is_vec<Args>...>::value)>                                       \
+    KFR_INTRINSIC Tout fn(const T1& a, const Args&... b) CMT_NOEXCEPT                                        \
+    {                                                                                                        \
+        using vecout = vec1<Tout>;                                                                           \
+        return to_scalar(::kfr::intrinsics::fn(vecout(a), vecout(b)...));                                    \
+    }
+
+namespace intrinsics
+{
+#ifdef CMT_ARCH_X86
+// Shorthand aliases for vectors that exactly fill one native register:
+// 128-bit (SSE), 256-bit (AVX) and 512-bit (AVX-512) per element type.
+using f32sse = vec<f32, 4>;
+using f64sse = vec<f64, 2>;
+using i8sse = vec<i8, 16>;
+using i16sse = vec<i16, 8>;
+using i32sse = vec<i32, 4>;
+using i64sse = vec<i64, 2>;
+using u8sse = vec<u8, 16>;
+using u16sse = vec<u16, 8>;
+using u32sse = vec<u32, 4>;
+using u64sse = vec<u64, 2>;
+
+using f32avx = vec<f32, 8>;
+using f64avx = vec<f64, 4>;
+using i8avx = vec<i8, 32>;
+using i16avx = vec<i16, 16>;
+using i32avx = vec<i32, 8>;
+using i64avx = vec<i64, 4>;
+using u8avx = vec<u8, 32>;
+using u16avx = vec<u16, 16>;
+using u32avx = vec<u32, 8>;
+using u64avx = vec<u64, 4>;
+
+using f32avx512 = vec<f32, 16>;
+using f64avx512 = vec<f64, 8>;
+using i8avx512 = vec<i8, 64>;
+using i16avx512 = vec<i16, 32>;
+using i32avx512 = vec<i32, 16>;
+using i64avx512 = vec<i64, 8>;
+using u8avx512 = vec<u8, 64>;
+using u16avx512 = vec<u16, 32>;
+using u32avx512 = vec<u32, 16>;
+using u64avx512 = vec<u64, 8>;
+
+#else
+// Non-x86 (128-bit NEON registers). Note f64neon is defined even though
+// f64 NEON support exists only on AArch64 — presumably gated elsewhere.
+using f32neon = vec<f32, 4>;
+using f64neon = vec<f64, 2>;
+using i8neon = vec<i8, 16>;
+using i16neon = vec<i16, 8>;
+using i32neon = vec<i32, 4>;
+using i64neon = vec<i64, 2>;
+using u8neon = vec<u8, 16>;
+using u16neon = vec<u16, 8>;
+using u32neon = vec<u32, 4>;
+using u64neon = vec<u64, 2>;
+#endif
+
+// Smallest efficient SIMD width that can hold n elements of T: at least the
+// minimum hardware width, otherwise the next power of two >= n.
+template <typename T>
+constexpr inline size_t next_simd_width(size_t n) CMT_NOEXCEPT
+{
+    return n < minimum_vector_width<T> ? minimum_vector_width<T> : next_poweroftwo(n);
+}
+
+// Widen a 1-element vector by broadcasting its value to all Nout lanes.
+// NOTE(review): N is not deducible from vec<T, 1>, so this overload only
+// participates when N is given explicitly — confirm intended call sites.
+template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
+KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, 1>& x) CMT_NOEXCEPT
+{
+    return broadcast<Nout>(x);
+}
+
+// Widen to the next SIMD width; trailing lanes are unspecified (extend).
+template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
+KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x) CMT_NOEXCEPT
+{
+    return extend<Nout>(x);
+}
+
+// Widen to the next SIMD width, filling trailing lanes with `value` (for
+// operations where padding lanes must be well-defined, e.g. div or min).
+template <typename T, size_t N, size_t Nout = next_simd_width<T>(N)>
+KFR_INTRINSIC vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) CMT_NOEXCEPT
+{
+    return widen<Nout>(x, value);
+}
+
+// intrin(): apply fn to vectors wider than the native width Nvec by
+// recursively splitting into low/high halves (vec stores oversize vectors as
+// a pair h.low/h.high) until each piece fits a native register, where the
+// base-case overload applies fn directly. One overload pair per arity /
+// scalar-mixing pattern.
+
+// Ternary, base case: fits in one native vector.
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
+                          Fn&& fn)
+{
+    result = fn(a, b, c);
+}
+
+// Ternary, recursive case: process halves independently.
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
+                          Fn&& fn)
+{
+    intrin(result.h.low, a.h.low, b.h.low, c.h.low, fn);
+    intrin(result.h.high, a.h.high, b.h.high, c.h.high, fn);
+}
+
+// Unary, base case.
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
+{
+    result = fn(a);
+}
+
+// Unary, recursive case. (Nvec is declared before N here, unlike the other
+// overloads — harmless since both are defaulted/deduced, but inconsistent.)
+template <typename T, size_t Nvec = vector_width<T>, size_t N, typename Fn, KFR_ENABLE_IF(N > Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, Fn&& fn)
+{
+    intrin(result.h.low, a.h.low, fn);
+    intrin(result.h.high, a.h.high, fn);
+}
+
+// Binary vec/vec.
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
+{
+    result = fn(a, b);
+}
+
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const vec<T, N>& b, Fn&& fn)
+{
+    intrin(result.h.low, a.h.low, b.h.low, fn);
+    intrin(result.h.high, a.h.high, b.h.high, fn);
+}
+
+// Binary vec/scalar: the scalar is passed through unchanged to both halves.
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
+{
+    result = fn(a, b);
+}
+
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const vec<T, N>& a, const T& b, Fn&& fn)
+{
+    intrin(result.h.low, a.h.low, b, fn);
+    intrin(result.h.high, a.h.high, b, fn);
+}
+
+// Binary scalar/vec.
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N <= Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
+{
+    result = fn(a, b);
+}
+
+template <typename T, size_t N, size_t Nvec = vector_width<T>, typename Fn, KFR_ENABLE_IF(N > Nvec)>
+KFR_INTRINSIC void intrin(vec<T, N>& result, const T& a, const vec<T, N>& b, Fn&& fn)
+{
+    intrin(result.h.low, a, b.h.low, fn);
+    intrin(result.h.high, a, b.h.high, fn);
+}
+
+// Unary dispatcher for non-native widths, guarded by an extra compile-time
+// condition `cond`. Narrow vectors are widened (shuffle to Nout lanes),
+// processed, and truncated; wide vectors go through the recursive intrin()
+// splitter. (Comments kept outside the #define bodies — '//' inside a
+// backslash-continued macro would break the continuation.)
+#define KFR_HANDLE_ALL_SIZES_1_IF(fn, cond)                                                                  \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value && cond)>   \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
+    {                                                                                                        \
+        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
+        return intrinsics::fn(a.shuffle(csizeseq<Nout>)).shuffle(csizeseq<N>);                               \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value && cond),    \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a) CMT_NOEXCEPT                                              \
+    {                                                                                                        \
+        vec<T, N> r;                                                                                         \
+        intrin(r, a, [](const auto& x) { return intrinsics::fn(x); });                                       \
+        return r;                                                                                            \
+    }
+
+// Unconditional variant of the unary dispatcher.
+#define KFR_HANDLE_ALL_SIZES_1(fn) KFR_HANDLE_ALL_SIZES_1_IF(fn, true)
+
+// Binary dispatcher for non-native widths: three overload pairs covering
+// vec/vec, vec/scalar and scalar/vec (scalars are broadcast to Nout lanes
+// in the narrow case, or forwarded to intrin() in the wide case).
+#define KFR_HANDLE_ALL_SIZES_2(fn)                                                                           \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
+    {                                                                                                        \
+        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
+        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), b.shuffle(csizeseq_t<Nout>()))                  \
+            .shuffle(csizeseq<N>);                                                                           \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) CMT_NOEXCEPT                          \
+    {                                                                                                        \
+        vec<T, N> r;                                                                                         \
+        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
+        return r;                                                                                            \
+    }                                                                                                        \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
+    {                                                                                                        \
+        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
+        return intrinsics::fn(a.shuffle(csizeseq_t<Nout>()), vec<T, Nout>(b)).shuffle(csizeseq<N>);          \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const vec<T, N>& a, const T& b) CMT_NOEXCEPT                                  \
+    {                                                                                                        \
+        vec<T, N> r;                                                                                         \
+        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
+        return r;                                                                                            \
+    }                                                                                                        \
+    template <typename T, size_t N,                                                                          \
+              KFR_ENABLE_IF(N < vector_width<T> && !is_simd_size<T>(N) && is_simd_type<T>::value)>           \
+    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
+    {                                                                                                        \
+        constexpr size_t Nout = intrinsics::next_simd_width<T>(N);                                           \
+        return intrinsics::fn(vec<T, Nout>(a), b.shuffle(csizeseq_t<Nout>())).shuffle(csizeseq<N>);          \
+    }                                                                                                        \
+    template <typename T, size_t N, KFR_ENABLE_IF(N > vector_width<T> && is_simd_type<T>::value),            \
+              typename = void>                                                                               \
+    KFR_INTRINSIC vec<T, N> fn(const T& a, const vec<T, N>& b) CMT_NOEXCEPT                                  \
+    {                                                                                                        \
+        vec<T, N> r;                                                                                         \
+        intrin(r, a, b, [](const auto& a, const auto& b) { return intrinsics::fn(a, b); });                  \
+        return r;                                                                                            \
+    }
+
+// vec1<T>: T itself when T is already a vector, otherwise a 1-element vector.
+// Used by the scalar-handler macros to promote scalar arguments uniformly.
+template <typename T>
+using vec1 = conditional<is_vec<T>::value, T, vec<T, 1>>;
+
+// to_scalar: identity for ordinary values...
+template <typename T>
+inline const T& to_scalar(const T& value) CMT_NOEXCEPT
+{
+    return value;
+}
+// ...and unwrapping for 1-element vectors (returns element 0 by value).
+template <typename T>
+inline T to_scalar(const vec<T, 1>& value) CMT_NOEXCEPT
+{
+    return value[0];
+}
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/simd/impl/intrinsics.h b/include/kfr/simd/impl/intrinsics.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// Central include point for platform intrinsics plus tiny wrappers around
+// compiler builtins (addressof, sqrt, memcpy/memset) used by the SIMD layer.
+#include "../../cident.h"
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef CMT_ARCH_SSE2
+#include <immintrin.h>
+#ifdef CMT_OS_WIN
+#include <intrin.h>
+#endif
+#endif
+
+#ifdef CMT_ARCH_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined CMT_COMPILER_GCC && defined CMT_ARCH_X86
+#include <x86intrin.h>
+#endif
+
+// builtin_addressof: real address even when operator& is overloaded.
+// Clang gets the builtin; others use the classic char-reinterpret idiom.
+// NOTE(review): the fallback template lives in the global namespace of a
+// public header — consider namespacing to avoid collisions.
+#ifdef CMT_COMPILER_CLANG
+#define builtin_addressof(x) __builtin_addressof(x)
+#else
+template <class T>
+inline T* builtin_addressof(T& arg)
+{
+    return reinterpret_cast<T*>(&const_cast<char&>(reinterpret_cast<const volatile char&>(arg)));
+}
+#endif
+
+// sqrt/memcpy/memset wrappers: GNU-compatible compilers use __builtin_*
+// (inlinable without libc headers' macro quirks); others call libc directly.
+#ifdef CMT_COMPILER_GNU
+CMT_INLINE float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
+CMT_INLINE double builtin_sqrt(double x) { return __builtin_sqrt(x); }
+CMT_INLINE long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
+CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size)
+{
+    __builtin_memcpy(dest, src, size);
+}
+CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { __builtin_memset(dest, val, size); }
+#else
+CMT_INLINE float builtin_sqrt(float x) { return ::sqrtf(x); }
+CMT_INLINE double builtin_sqrt(double x) { return ::sqrt(x); }
+CMT_INLINE long double builtin_sqrt(long double x) { return ::sqrtl(x); }
+CMT_INLINE void builtin_memcpy(void* dest, const void* src, size_t size) { ::memcpy(dest, src, size); }
+CMT_INLINE void builtin_memset(void* dest, int val, size_t size) { ::memset(dest, val, size); }
+#endif
+
+#define KFR_ENABLE_IF CMT_ENABLE_IF
diff --git a/include/kfr/simd/impl/operators.hpp b/include/kfr/simd/impl/operators.hpp
@@ -0,0 +1,164 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "function.hpp"
+
+#ifdef CMT_CLANG_EXT
+#include "basicoperators_clang.hpp"
+#else
+#include "basicoperators_generic.hpp"
+#endif
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+
+// Complex vectors are stored as interleaved (re, im) pairs; flatten() views
+// them as vec<T, 2N>, so neg/add/sub and the bitwise ops apply elementwise
+// to both components directly.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> neg(const vec<complex<T>, N>& x)
+{
+    return neg(x.flatten()).v;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> add(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    return add(x.flatten(), y.flatten()).v;
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> sub(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    return sub(x.flatten(), y.flatten()).v;
+}
+
+// Complex multiply, (a+bi)(c+di) = (ac-bd) + (bc+ad)i, vectorized:
+// dupeven(yy)=(c,c), dupodd(yy)=(d,d), swap<2>(xx)=(b,a);
+// subadd(xx*(c,c), (b,a)*(d,d)) = (ac-bd, bc+ad).
+// (Assumes dupeven/dupodd duplicate even/odd lanes, swap<2> swaps pairs,
+// subadd subtracts in even lanes and adds in odd lanes.)
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> mul(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    const vec<T, (N * 2)> xx = x.v;
+    const vec<T, (N * 2)> yy = y.v;
+    return subadd(mul(xx, dupeven(yy)), mul(swap<2>(xx), dupodd(yy))).v;
+}
+
+// Complex divide: (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2).
+// m = (c^2+d^2) in every lane; subadd(...) yields (bc-ad, ac+bd), and the
+// final swap<2> restores (re, im) order before dividing by m.
+// No guard against |y| == 0 — division by a zero complex follows the
+// underlying scalar/SIMD divide semantics.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> div(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    const vec<T, (N * 2)> xx = x.v;
+    const vec<T, (N * 2)> yy = y.v;
+    const vec<T, (N * 2)> m = (add(sqr(dupeven(yy)), sqr(dupodd(yy))));
+    return swap<2>(subadd(mul(swap<2>(xx), dupeven(yy)), mul(xx, dupodd(yy))) / m).v;
+}
+
+// Bitwise ops on complex vectors act on the flattened (re, im) storage.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> bor(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    return bor(x.flatten(), y.flatten()).v;
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> bxor(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    return bxor(x.flatten(), y.flatten()).v;
+}
+template <typename T, size_t N>
+KFR_INTRINSIC vec<complex<T>, N> band(const vec<complex<T>, N>& x, const vec<complex<T>, N>& y)
+{
+    return band(x.flatten(), y.flatten()).v;
+}
+
+// Mixed complex-vector / complex-scalar overloads: broadcast the scalar and
+// delegate to the vec/vec implementation (both argument orders).
+#define KFR_COMPLEX_OP_CVT(fn)                                                                               \
+    template <typename T, size_t N>                                                                          \
+    KFR_INTRINSIC vec<complex<T>, N> fn(const vec<complex<T>, N>& x, const complex<T>& y)                    \
+    {                                                                                                        \
+        return fn(x, vec<complex<T>, N>(y));                                                                 \
+    }                                                                                                        \
+    template <typename T, size_t N>                                                                          \
+    KFR_INTRINSIC vec<complex<T>, N> fn(const complex<T>& x, const vec<complex<T>, N>& y)                    \
+    {                                                                                                        \
+        return fn(vec<complex<T>, N>(x), y);                                                                 \
+    }
+
+// add/sub are intentionally absent here — presumably covered by another
+// overload set; confirm if adding new ops to this list.
+KFR_COMPLEX_OP_CVT(mul)
+KFR_COMPLEX_OP_CVT(div)
+KFR_COMPLEX_OP_CVT(band)
+KFR_COMPLEX_OP_CVT(bxor)
+KFR_COMPLEX_OP_CVT(bor)
+
+// Unary op on vectors-of-vectors: flatten to a single vec and reuse the
+// elementwise implementation.
+#define KFR_VECVEC_OP1(fn)                                                                                   \
+    template <typename T1, size_t N1, size_t N2>                                                             \
+    KFR_INTRINSIC vec<vec<T1, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x)                                     \
+    {                                                                                                        \
+        return fn(x.flatten()).v;                                                                            \
+    }
+
+// Binary ops on vectors-of-vectors: operands are innercast to the common
+// element type C; a scalar is broadcast, an inner vec<T, N1> is repeated N2
+// times to match the flattened layout.
+#define KFR_VECVEC_OP2(fn)                                                                                   \
+    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
+              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
+    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const vec<vec<T2, N1>, N2>& y)       \
+    {                                                                                                        \
+        return fn(innercast<C>(x.flatten()), innercast<C>(y.flatten())).v;                                   \
+    }                                                                                                        \
+    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
+              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
+    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const T2& y)                         \
+    {                                                                                                        \
+        return fn(innercast<C>(x.flatten()), innercast<C>(y)).v;                                             \
+    }                                                                                                        \
+    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
+              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
+    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<vec<T1, N1>, N2>& x, const vec<T2, N1>& y)                \
+    {                                                                                                        \
+        return fn(innercast<C>(x.flatten()), repeat<N2>(innercast<C>(y.flatten()))).v;                       \
+    }                                                                                                        \
+    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
+              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
+    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const T1& x, const vec<vec<T2, N1>, N2>& y)                         \
+    {                                                                                                        \
+        return fn(innercast<C>(x), innercast<C>(y.flatten())).v;                                             \
+    }                                                                                                        \
+    template <typename T1, typename T2, size_t N1, size_t N2, typename C = common_type<T1, T2>,              \
+              KFR_ENABLE_IF(is_simd_type<C>::value)>                                                         \
+    KFR_INTRINSIC vec<vec<C, N1>, N2> fn(const vec<T1, N1>& x, const vec<vec<T2, N1>, N2>& y)                \
+    {                                                                                                        \
+        return fn(repeat<N2>(innercast<C>(x.flatten())), innercast<C>(y.flatten())).v;                       \
+    }
+
+KFR_VECVEC_OP1(neg)
+KFR_VECVEC_OP1(bnot)
+KFR_VECVEC_OP2(add)
+KFR_VECVEC_OP2(sub)
+KFR_VECVEC_OP2(mul)
+KFR_VECVEC_OP2(div)
+KFR_VECVEC_OP2(band)
+KFR_VECVEC_OP2(bor)
+KFR_VECVEC_OP2(bxor)
+
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/impl/simd.hpp b/include/kfr/simd/impl/simd.hpp
@@ -0,0 +1,183 @@
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../platform.hpp"
+
+namespace kfr
+{
+
+inline namespace CMT_ARCH_NAME
+{
+
+#if defined CMT_COMPILER_GNU
+constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("0xFFFFFFFF"); }
+constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("0xFFFFFFFFFFFFFFFF"); }
+constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("0x7FFFFFFF"); }
+constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("0x7FFFFFFFFFFFFFFF"); }
+#elif defined CMT_COMPILER_MSVC
+constexpr f32 allones_f32() CMT_NOEXCEPT { return -__builtin_nanf("-1"); }
+constexpr f64 allones_f64() CMT_NOEXCEPT { return -__builtin_nan("-1"); }
+constexpr f32 invhighbit_f32() CMT_NOEXCEPT { return __builtin_nanf("-1"); }
+constexpr f64 invhighbit_f64() CMT_NOEXCEPT { return __builtin_nan("-1"); }
+#else
+inline f32 allones_f32() CMT_NOEXCEPT
+{
+ return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0xFFFFFFFFu)));
+}
+inline f64 allones_f64() CMT_NOEXCEPT
+{
+ return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0xFFFFFFFFFFFFFFFFull)));
+}
+inline f32 invhighbit_f32() CMT_NOEXCEPT
+{
+ return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(0x7FFFFFFFu)));
+}
+inline f64 invhighbit_f64() CMT_NOEXCEPT
+{
+ return _mm_cvtsd_f64(_mm_castsi128_pd(_mm_cvtsi64x_si128(0x7FFFFFFFFFFFFFFFull)));
+}
+#endif
+
+template <typename T>
+struct special_scalar_constants
+{
+ constexpr static T highbitmask() { return static_cast<T>(1ull << (sizeof(T) * 8 - 1)); }
+ constexpr static T allones() { return static_cast<T>(-1ll); }
+ constexpr static T allzeros() { return T(0); }
+ constexpr static T invhighbitmask() { return static_cast<T>((1ull << (sizeof(T) * 8 - 1)) - 1); }
+};
+
+#ifndef CMT_COMPILER_INTEL
+#define KFR_CONSTEXPR_NON_INTEL constexpr
+#else
+#define KFR_CONSTEXPR_NON_INTEL
+#endif
+
+template <>
+struct special_scalar_constants<float>
+{
+ constexpr static float highbitmask() { return -0.f; }
+ KFR_CONSTEXPR_NON_INTEL static float allones() noexcept { return allones_f32(); };
+ constexpr static float allzeros() { return 0.f; }
+ KFR_CONSTEXPR_NON_INTEL static float invhighbitmask() { return invhighbit_f32(); }
+};
+
+template <>
+struct special_scalar_constants<double>
+{
+ constexpr static double highbitmask() { return -0.; }
+ KFR_CONSTEXPR_NON_INTEL static double allones() noexcept { return allones_f64(); };
+ constexpr static double allzeros() { return 0.; }
+ KFR_CONSTEXPR_NON_INTEL static double invhighbitmask() { return invhighbit_f64(); }
+};
+
+template <typename T>
+struct special_constants : public special_scalar_constants<subtype<T>>
+{
+public:
+ using Tsub = subtype<T>;
+};
+
+namespace intrinsics
+{
+
+template <typename T, size_t N>
+struct simd_t
+{
+ using value_type = T;
+
+ constexpr static size_t size() { return N; }
+};
+
+template <typename T, size_t N1, size_t N2>
+struct simd2_t
+{
+ using value_type = T;
+
+ constexpr static size_t size1() { return N1; }
+
+ constexpr static size_t size2() { return N2; }
+};
+
+template <typename Tout, typename Tin, size_t N>
+struct simd_cvt_t
+{
+ using value_type_out = Tout;
+ using value_type_in = Tin;
+
+ constexpr static size_t size() { return N; }
+};
+
+template <typename T, size_t N>
+constexpr size_t alignment()
+{
+ return const_min(size_t(platform<>::native_vector_alignment), next_poweroftwo(sizeof(T) * N));
+}
+
+template <typename T, size_t N>
+struct alignas(alignment<T, N>()) simd_array
+{
+ T val[next_poweroftwo(N)];
+};
+
+template <typename T, size_t N>
+struct simd_type;
+
+template <typename T>
+struct simd_type<T, 0>
+{
+ // SFINAE
+};
+
+template <typename T, size_t N>
+struct simd_halves
+{
+ using subtype = typename simd_type<T, prev_poweroftwo(N - 1)>::type;
+
+ subtype low;
+ subtype high;
+#if KFR_DEFINE_CTORS_FOR_HALVES
+ simd_halves() CMT_NOEXCEPT {}
+ simd_halves(const subtype& l, const subtype& h) CMT_NOEXCEPT : low(l), high(h) {}
+ simd_halves(const simd_halves& v) CMT_NOEXCEPT : low(v.low), high(v.high) {}
+ simd_halves(simd_halves&& v) CMT_NOEXCEPT : low(v.low), high(v.high) {}
+
+ simd_halves& operator=(const simd_halves& v) CMT_NOEXCEPT
+ {
+ low = v.low;
+ high = v.high;
+ return *this;
+ }
+ simd_halves& operator=(simd_halves&& v) CMT_NOEXCEPT
+ {
+ low = v.low;
+ high = v.high;
+ return *this;
+ }
+#endif
+};
+
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/impl/specializations.i b/include/kfr/simd/impl/specializations.i
@@ -0,0 +1,116 @@
+/**
+ * Copyright (C) 2016 D Levin (http://www.kfrlib.com)
+ * This file is part of KFR
+ *
+ * KFR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KFR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with KFR.
+ */
+#pragma once
+
+#include "../vec.hpp"
+#ifndef KFR_SHUFFLE_SPECIALIZATIONS
+#include "../shuffle.hpp"
+#endif
+
+#ifdef KFR_COMPILER_GNU
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+namespace intrinsics
+{
+template <>
+inline vec<f32, 32> shufflevector<f32, 32>(
+ csizes_t<0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14,
+ 15, 22, 23, 30, 31>,
+ const vec<f32, 32>& x, const vec<f32, 32>&)
+{
+ f32x32 w = x;
+
+ w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(low(w)),
+ permute<0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15>(high(w)));
+
+ w = permutegroups<(4), 0, 4, 2, 6, 1, 5, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
+ return w;
+}
+
+template <>
+inline vec<f32, 32> shufflevector<f32, 32>(
+ csizes_t<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18, 19, 10, 11, 26, 27, 6, 7, 22,
+ 23, 14, 15, 30, 31>,
+ const vec<f32, 32>& x, const vec<f32, 32>&)
+{
+ f32x32 w = x;
+
+ w = concat(permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(even<8>(w)),
+ permute<0, 1, 8, 9, 4, 5, 12, 13, /**/ 2, 3, 10, 11, 6, 7, 14, 15>(odd<8>(w)));
+
+ w = permutegroups<(4), 0, 4, 1, 5, 2, 6, 3, 7>(w); // avx: vperm2f128 & vinsertf128, sse: no-op
+ return w;
+}
+
+inline vec<f32, 32> bitreverse_2(const vec<f32, 32>& x)
+{
+ return shufflevector<f32, 32>(csizes<0, 1, 16, 17, 8, 9, 24, 25, 4, 5, 20, 21, 12, 13, 28, 29, 2, 3, 18,
+ 19, 10, 11, 26, 27, 6, 7, 22, 23, 14, 15, 30, 31>,
+ x, x);
+}
+
+template <>
+inline vec<f32, 64> shufflevector<f32, 64>(
+ csizes_t<0, 1, 32, 33, 16, 17, 48, 49, 8, 9, 40, 41, 24, 25, 56, 57, 4, 5, 36, 37, 20, 21, 52, 53, 12, 13,
+ 44, 45, 28, 29, 60, 61, 2, 3, 34, 35, 18, 19, 50, 51, 10, 11, 42, 43, 26, 27, 58, 59, 6, 7, 38,
+ 39, 22, 23, 54, 55, 14, 15, 46, 47, 30, 31, 62, 63>,
+ const vec<f32, 64>& x, const vec<f32, 64>&)
+{
+ return permutegroups<(8), 0, 4, 1, 5, 2, 6, 3, 7>(
+ concat(bitreverse_2(even<8>(x)), bitreverse_2(odd<8>(x))));
+}
+
+template <>
+inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15>,
+ const vec<f32, 16>& x, const vec<f32, 16>&)
+{
+ // asm volatile("int $3");
+ const vec<f32, 16> xx = permutegroups<(4), 0, 2, 1, 3>(x);
+
+ return concat(shuffle<0, 2, 8 + 0, 8 + 2>(low(xx), high(xx)),
+ shuffle<1, 3, 8 + 1, 8 + 3>(low(xx), high(xx)));
+}
+
+template <>
+inline vec<f32, 16> shufflevector<f32, 16>(csizes_t<0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15>,
+ const vec<f32, 16>& x, const vec<f32, 16>&)
+{
+ const vec<f32, 16> xx =
+ concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
+
+ return permutegroups<(4), 0, 2, 1, 3>(xx);
+}
+
+template <>
+inline vec<f32, 32> shufflevector<f32, 32>(
+ csizes_t<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13,
+ 29, 14, 30, 15, 31>,
+ const vec<f32, 32>& x, const vec<f32, 32>&)
+{
+ const vec<f32, 32> xx = permutegroups<(8), 0, 2, 1, 3>(x);
+
+ return concat(interleavehalfs(low(xx)), interleavehalfs(high(xx)));
+}
+} // namespace intrinsics
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+#endif
diff --git a/include/kfr/simd/mask.hpp b/include/kfr/simd/mask.hpp
@@ -0,0 +1,155 @@
+/** @addtogroup logical
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "vec.hpp"
+
+namespace kfr
+{
+
+inline namespace CMT_ARCH_NAME
+{
+
+template <typename T>
+using maskfor = typename T::mask_t;
+
+namespace internal
+{
+
+template <typename T>
+constexpr inline T maskbits(bool value)
+{
+ return value ? special_constants<T>::allones() : special_constants<T>::allzeros();
+}
+} // namespace internal
+
+template <typename T, size_t N>
+struct mask : protected vec<T, N>
+{
+ using base = vec<T, N>;
+
+ KFR_MEM_INTRINSIC mask() CMT_NOEXCEPT = default;
+
+ KFR_MEM_INTRINSIC mask(const mask&) CMT_NOEXCEPT = default;
+
+ KFR_MEM_INTRINSIC mask& operator=(const mask&) CMT_NOEXCEPT = default;
+
+ using simd_type = typename base::simd_type;
+
+ KFR_MEM_INTRINSIC mask(bool arg) : base(internal::maskbits<T>(arg)) {}
+
+ template <typename... Args>
+ KFR_MEM_INTRINSIC mask(bool arg1, bool arg2, Args... args)
+ : base(internal::maskbits<T>(arg1), internal::maskbits<T>(arg2),
+ internal::maskbits<T>(static_cast<bool>(args))...)
+ {
+ }
+
+ using vec<T, N>::v;
+
+ KFR_MEM_INTRINSIC mask(const base& v) CMT_NOEXCEPT;
+
+ KFR_MEM_INTRINSIC mask(const simd_type& simd) : base(simd) {}
+
+ template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
+ KFR_MEM_INTRINSIC mask(const mask<U, N>& m) : base(base::frombits(m.asvec()))
+ {
+ }
+
+ template <typename U, KFR_ENABLE_IF(sizeof(T) != sizeof(U))>
+ KFR_MEM_INTRINSIC mask(const mask<U, N>& m)
+ : base(base::frombits(innercast<itype<T>>(vec<itype<U>, N>::frombits(m.asvec()))))
+ {
+ }
+
+ KFR_MEM_INTRINSIC bool operator[](size_t index) const CMT_NOEXCEPT;
+
+ KFR_MEM_INTRINSIC constexpr base asvec() const CMT_NOEXCEPT { return base(v); }
+};
+
+namespace internal
+{
+
+template <typename T, size_t Nout, size_t N1, size_t... indices>
+constexpr vec<T, Nout> partial_mask_helper(csizes_t<indices...>)
+{
+ return make_vector(maskbits<T>(indices < N1)...);
+}
+
+template <typename T, size_t Nout, size_t N1>
+constexpr vec<T, Nout> partial_mask()
+{
+ return internal::partial_mask_helper<T, Nout, N1>(csizeseq_t<Nout>());
+}
+} // namespace internal
+
+template <typename T, size_t N>
+KFR_MEM_INTRINSIC bool mask<T, N>::operator[](size_t index) const CMT_NOEXCEPT
+{
+ return ibitcast(base::operator[](index)) < 0;
+}
+
+template <typename T, typename... Args, size_t Nout = (sizeof...(Args) + 1)>
+constexpr KFR_INTRINSIC mask<T, Nout> make_mask(bool arg, Args... args)
+{
+ return vec<T, Nout>(internal::maskbits<T>(arg), internal::maskbits<T>(static_cast<bool>(args))...);
+}
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+
+namespace cometa
+{
+
+template <typename T, size_t N>
+struct compound_type_traits<kfr::mask<T, N>>
+{
+ using subtype = T;
+ using deep_subtype = cometa::deep_subtype<T>;
+ constexpr static size_t width = N;
+ constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+ constexpr static bool is_scalar = false;
+ constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
+ template <typename U>
+ using rebind = kfr::mask<U, N>;
+ template <typename U>
+ using deep_rebind = kfr::mask<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
+
+ KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::mask<T, N>& value, size_t index)
+ {
+ return value[index];
+ }
+};
+} // namespace cometa
+
+namespace std
+{
+template <typename T1, typename T2, size_t N>
+struct common_type<kfr::mask<T1, N>, kfr::mask<T2, N>>
+{
+ using type = kfr::mask<typename common_type<T1, T2>::type, N>;
+};
+} // namespace std
diff --git a/include/kfr/simd/operators.hpp b/include/kfr/simd/operators.hpp
@@ -0,0 +1,810 @@
+/** @addtogroup basic_math
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "impl/operators.hpp"
+#include "mask.hpp"
+#include <algorithm>
+#include <utility>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+#define KFR_VEC_OPERATOR1(op, fn) \
+ template <typename T, size_t N> \
+ constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x) \
+ { \
+ return intrinsics::fn(x); \
+ }
+
+#define KFR_VEC_OPERATOR2(op, asgnop, fn) \
+ template <typename T1, typename T2, size_t N> \
+ constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const vec<T2, N>& y) \
+ { \
+ x = intrinsics::fn(x, elemcast<T1>(y)); \
+ return x; \
+ } \
+ template <typename T1, typename T2, size_t N> \
+ constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const T2& y) \
+ { \
+ x = intrinsics::fn(x, T1(y)); \
+ return x; \
+ } \
+ template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \
+ constexpr KFR_INTRINSIC vec<C, N> operator op(const vec<T1, N>& x, const T2& y) \
+ { \
+ return intrinsics::fn(elemcast<C>(x), C(y)); \
+ } \
+ template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \
+ constexpr KFR_INTRINSIC vec<C, N> operator op(const T1& x, const vec<T2, N>& y) \
+ { \
+ return intrinsics::fn(C(x), elemcast<C>(y)); \
+ } \
+ template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \
+ constexpr KFR_INTRINSIC vec<C, N> operator op(const vec<T1, N>& x, const vec<T2, N>& y) \
+ { \
+ return intrinsics::fn(elemcast<C>(x), elemcast<C>(y)); \
+ }
+
+#define KFR_VEC_SHIFT_OPERATOR(op, asgnop, fn) \
+ template <typename T1, size_t N> \
+ constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, unsigned y) \
+ { \
+ x = intrinsics::fn(x, y); \
+ return x; \
+ } \
+ template <typename T1, typename T2, size_t N> \
+ constexpr KFR_INTRINSIC vec<T1, N>& operator asgnop(vec<T1, N>& x, const vec<T2, N>& y) \
+ { \
+ x = intrinsics::fn(x, elemcast<utype<T1>>(y)); \
+ return x; \
+ } \
+ template <typename T, size_t N> \
+ constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x, unsigned y) \
+ { \
+ return intrinsics::fn(x, y); \
+ } \
+ template <typename T, typename T2, size_t N> \
+ constexpr KFR_INTRINSIC vec<T, N> operator op(const T& x, const vec<T2, N>& y) \
+ { \
+ return intrinsics::fn(innercast<T>(x), elemcast<utype<T>>(y)); \
+ } \
+ template <typename T, typename T2, size_t N> \
+ constexpr KFR_INTRINSIC vec<T, N> operator op(const vec<T, N>& x, const vec<T2, N>& y) \
+ { \
+ return intrinsics::fn(x, elemcast<utype<T>>(y)); \
+ }
+
+#define KFR_VEC_CMP_OPERATOR(op, fn) \
+ template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \
+ constexpr KFR_INTRINSIC mask<C, N> operator op(const vec<T1, N>& x, const T2& y) \
+ { \
+ return intrinsics::fn(elemcast<C>(x), vec<C, N>(y)).asmask(); \
+ } \
+ template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \
+ constexpr KFR_INTRINSIC mask<C, N> operator op(const T1& x, const vec<T2, N>& y) \
+ { \
+ return intrinsics::fn(vec<C, N>(x), elemcast<C>(y)).asmask(); \
+ } \
+ template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>> \
+ constexpr KFR_INTRINSIC mask<C, N> operator op(const vec<T1, N>& x, const vec<T2, N>& y) \
+ { \
+ return intrinsics::fn(elemcast<C>(x), elemcast<C>(y)).asmask(); \
+ }
+
+KFR_VEC_OPERATOR1(-, neg)
+KFR_VEC_OPERATOR1(~, bnot)
+
+KFR_VEC_OPERATOR2(+, +=, add)
+KFR_VEC_OPERATOR2(-, -=, sub)
+KFR_VEC_OPERATOR2(*, *=, mul)
+KFR_VEC_OPERATOR2(/, /=, div)
+
+KFR_VEC_OPERATOR2(&, &=, band)
+KFR_VEC_OPERATOR2(|, |=, bor)
+KFR_VEC_OPERATOR2 (^, ^=, bxor)
+KFR_VEC_SHIFT_OPERATOR(<<, <<=, shl)
+KFR_VEC_SHIFT_OPERATOR(>>, >>=, shr)
+
+KFR_VEC_CMP_OPERATOR(==, eq)
+KFR_VEC_CMP_OPERATOR(!=, ne)
+KFR_VEC_CMP_OPERATOR(>=, ge)
+KFR_VEC_CMP_OPERATOR(<=, le)
+KFR_VEC_CMP_OPERATOR(>, gt)
+KFR_VEC_CMP_OPERATOR(<, lt)
+
+template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>,
+ KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))>
+KFR_INTRINSIC mask<C, N> operator&(const mask<T1, N>& x, const mask<T2, N>& y)CMT_NOEXCEPT
+{
+ return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) & bitcast<C>(vec<T2, N>(y.v))).v);
+}
+template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>,
+ KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))>
+KFR_INTRINSIC mask<C, N> operator|(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT
+{
+ return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) | bitcast<C>(vec<T2, N>(y.v))).v);
+}
+template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>,
+ KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))>
+KFR_INTRINSIC mask<C, N> operator&&(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT
+{
+ return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) & bitcast<C>(vec<T2, N>(y.v))).v);
+}
+template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>,
+ KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))>
+KFR_INTRINSIC mask<C, N> operator||(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT
+{
+ return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) | bitcast<C>(vec<T2, N>(y.v))).v);
+}
+template <typename T1, typename T2, size_t N, typename C = common_type<T1, T2>,
+ KFR_ENABLE_IF(sizeof(T1) == sizeof(T2))>
+KFR_INTRINSIC mask<C, N> operator^(const mask<T1, N>& x, const mask<T2, N>& y) CMT_NOEXCEPT
+{
+ return mask<C, N>((bitcast<C>(vec<T1, N>(x.v)) ^ bitcast<C>(vec<T2, N>(y.v))).v);
+}
+
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> operator~(const mask<T, N>& x) CMT_NOEXCEPT
+{
+ return ~x.asvec();
+}
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> operator!(const mask<T, N>& x) CMT_NOEXCEPT
+{
+ return ~x.asvec();
+}
+
+KFR_INTRINSIC float bitwisenot(float x) { return fbitcast(~ubitcast(x)); }
+KFR_INTRINSIC float bitwiseor(float x, float y) { return fbitcast(ubitcast(x) | ubitcast(y)); }
+KFR_INTRINSIC float bitwiseand(float x, float y) { return fbitcast(ubitcast(x) & ubitcast(y)); }
+KFR_INTRINSIC float bitwiseandnot(float x, float y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); }
+KFR_INTRINSIC float bitwisexor(float x, float y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); }
+KFR_INTRINSIC double bitwisenot(double x) { return fbitcast(~ubitcast(x)); }
+KFR_INTRINSIC double bitwiseor(double x, double y) { return fbitcast(ubitcast(x) | ubitcast(y)); }
+KFR_INTRINSIC double bitwiseand(double x, double y) { return fbitcast(ubitcast(x) & ubitcast(y)); }
+KFR_INTRINSIC double bitwiseandnot(double x, double y) { return fbitcast(ubitcast(x) & ~ubitcast(y)); }
+KFR_INTRINSIC double bitwisexor(double x, double y) { return fbitcast(ubitcast(x) ^ ubitcast(y)); }
+
+/// @brief Bitwise Not
+template <typename T1>
+KFR_INTRINSIC T1 bitwisenot(const T1& x)
+{
+ return ~x;
+}
+KFR_FN(bitwisenot)
+
+/// @brief Bitwise And
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> bitwiseand(const T1& x, const T2& y)
+{
+ return x & y;
+}
+template <typename T>
+constexpr KFR_INTRINSIC T bitwiseand(initialvalue<T>)
+{
+ return constants<T>::allones();
+}
+KFR_FN(bitwiseand)
+
+/// @brief Bitwise And-Not
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> bitwiseandnot(const T1& x, const T2& y)
+{
+ return x & ~y;
+}
+template <typename T>
+constexpr inline T bitwiseandnot(initialvalue<T>)
+{
+ return constants<T>::allones();
+}
+KFR_FN(bitwiseandnot)
+
+/// @brief Bitwise Or
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> bitwiseor(const T1& x, const T2& y)
+{
+ return x | y;
+}
+template <typename T>
+constexpr KFR_INTRINSIC T bitwiseor(initialvalue<T>)
+{
+ return subtype<T>(0);
+}
+KFR_FN(bitwiseor)
+
+/// @brief Bitwise Xor (Exclusive Or)
+template <typename T1, typename T2>
+KFR_INTRINSIC common_type<T1, T2> bitwisexor(const T1& x, const T2& y)
+{
+ return x ^ y;
+}
+template <typename T>
+constexpr KFR_INTRINSIC T bitwisexor(initialvalue<T>)
+{
+ return subtype<T>();
+}
+KFR_FN(bitwisexor)
+
+/// @brief Bitwise Left shift
+template <typename T1, typename T2>
+KFR_INTRINSIC T1 shl(const T1& left, const T2& right)
+{
+ return left << right;
+}
+KFR_FN(shl)
+
+/// @brief Bitwise Right shift
+template <typename T1, typename T2>
+KFR_INTRINSIC T1 shr(const T1& left, const T2& right)
+{
+ return left >> right;
+}
+KFR_FN(shr)
+
+/// @brief Bitwise Left Rotate
+template <typename T1, typename T2>
+KFR_INTRINSIC T1 rol(const T1& left, const T2& right)
+{
+ return shl(left, right) | shr(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right));
+}
+KFR_FN(rol)
+
+/// @brief Bitwise Right Rotate
+template <typename T1, typename T2>
+KFR_INTRINSIC T1 ror(const T1& left, const T2& right)
+{
+ return shr(left, right) | shl(left, (static_cast<subtype<T1>>(typebits<T1>::bits) - right));
+}
+KFR_FN(ror)
+
+template <typename T>
+constexpr KFR_INTRINSIC T add(const T& x)
+{
+ return x;
+}
+
+/**
+ * @brief Returns sum of all the arguments passed to a function.
+ */
+template <typename T1, typename T2, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, T2, Ts...>::value)>
+constexpr KFR_INTRINSIC common_type<T1, T2, Ts...> add(const T1& x, const T2& y, const Ts&... rest)
+{
+ return x + add(y, rest...);
+}
+template <typename T>
+constexpr KFR_INTRINSIC T add(initialvalue<T>)
+{
+ return T(0);
+}
+KFR_FN(add)
+
+/**
+ * @brief Returns template expression that returns sum of all the arguments passed to a function.
+ */
+template <typename... E, KFR_ENABLE_IF((is_input_expressions<E...>::value) && true)>
+KFR_INTRINSIC internal::expression_function<fn::add, E...> add(E&&... x)
+{
+ return { fn::add(), std::forward<E>(x)... };
+}
+
+template <typename T1, typename T2>
+constexpr KFR_INTRINSIC common_type<T1, T2> sub(const T1& x, const T2& y)
+{
+ return x - y;
+}
+template <typename T>
+constexpr KFR_INTRINSIC T sub(initialvalue<T>)
+{
+ return T(0);
+}
+KFR_FN(sub)
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::sub, E1, E2> sub(E1&& x, E2&& y)
+{
+ return { fn::sub(), std::forward<E1>(x), std::forward<E2>(y) };
+}
+
+template <typename T1>
+constexpr KFR_INTRINSIC T1 mul(const T1& x)
+{
+ return x;
+}
+
+/**
+ * @brief Returns product of all the arguments passed to a function.
+ */
+template <typename T1, typename T2, typename... Ts>
+constexpr KFR_INTRINSIC common_type<T1, T2, Ts...> mul(const T1& x, const T2& y, const Ts&... rest)
+{
+ return x * mul(y, rest...);
+}
+
+template <typename T>
+constexpr KFR_INTRINSIC T mul(initialvalue<T>)
+{
+ return T(1);
+}
+KFR_FN(mul)
+
+/**
+ * @brief Returns template expression that returns product of all the arguments passed to a function.
+ */
+template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
+KFR_INTRINSIC internal::expression_function<fn::mul, E...> mul(E&&... x)
+{
+ return { fn::mul(), std::forward<E>(x)... };
+}
+
+/**
+ * @brief Returns square of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+constexpr inline T1 sqr(const T1& x)
+{
+ return x * x;
+}
+KFR_FN(sqr)
+
+/**
+ * @brief Returns template expression that returns square of x.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::sqr, E1> sqr(E1&& x)
+{
+ return { fn::sqr(), std::forward<E1>(x) };
+}
+
+/**
+ * @brief Returns cube of x.
+ */
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>::value)>
+constexpr inline T1 cub(const T1& x)
+{
+ return sqr(x) * x;
+}
+KFR_FN(cub)
+
+/**
+ * @brief Returns template expression that returns cube of x.
+ */
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::cub, E1> cub(E1&& x)
+{
+ return { fn::cub(), std::forward<E1>(x) };
+}
+
+template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
+constexpr KFR_INTRINSIC T pow2(const T& x)
+{
+ return sqr(x);
+}
+
+template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
+constexpr KFR_INTRINSIC T pow3(const T& x)
+{
+ return cub(x);
+}
+
+template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
+constexpr KFR_INTRINSIC T pow4(const T& x)
+{
+ return sqr(sqr(x));
+}
+
+template <typename T, KFR_ENABLE_IF(is_numeric_args<T>::value)>
+constexpr KFR_INTRINSIC T pow5(const T& x)
+{
+ return pow4(x) * x;
+}
+KFR_FN(pow2)
+KFR_FN(pow3)
+KFR_FN(pow4)
+KFR_FN(pow5)
+
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::pow2, E1> pow2(E1&& x)
+{
+ return { fn::pow2(), std::forward<E1>(x) };
+}
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::pow3, E1> pow3(E1&& x)
+{
+ return { fn::pow3(), std::forward<E1>(x) };
+}
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::pow4, E1> pow4(E1&& x)
+{
+ return { fn::pow4(), std::forward<E1>(x) };
+}
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
+KFR_INTRINSIC internal::expression_function<fn::pow5, E1> pow5(E1&& x)
+{
+ return { fn::pow5(), std::forward<E1>(x) };
+}
+
+/// Raise x to the power base \f$ x^{base} \f$
+/// @code
+/// CHECK( ipow( 10, 3 ) == 1000 );
+/// CHECK( ipow( 0.5, 2 ) == 0.25 );
+/// @endcode
+template <typename T>
+constexpr inline T ipow(const T& x, int base)
+{
+ T xx = x;
+ T result = T(1);
+ while (base)
+ {
+ if (base & 1)
+ result *= xx;
+ base >>= 1;
+ xx *= xx;
+ }
+ return result;
+}
+KFR_FN(ipow)
+
+template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
+KFR_INTRINSIC internal::expression_function<fn::ipow, E1, E2> ipow(E1&& x, E2&& b)
+{
+ return { fn::ipow(), std::forward<E1>(x), std::forward<E2>(b) };
+}
+
+/// Return square of the sum of all arguments
+/// @code
+/// CHECK(sqrsum(1,2,3) == 36);
+/// @endcode
+template <typename T1, typename... Ts>
+constexpr inline common_type<T1, Ts...> sqrsum(const T1& x, const Ts&... rest)
+{
+ return sqr(add(x, rest...));
+}
+
+template <typename T1, typename T2>
+constexpr inline common_type<T1, T2> sqrdiff(const T1& x, const T2& y)
+{
+ return sqr(x - y);
+}
+KFR_FN(sqrsum)
+KFR_FN(sqrdiff)
+
+/// Division
+template <typename T1, typename T2, typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout div(const T1& x, const T2& y)
+{
+ return static_cast<Tout>(x) / static_cast<Tout>(y);
+}
+KFR_FN(div)
+
+/// Remainder
+template <typename T1, typename T2, typename Tout = common_type<T1, T2>>
+KFR_INTRINSIC Tout rem(const T1& x, const T2& y)
+{
+ return static_cast<Tout>(x) % static_cast<Tout>(y);
+}
+KFR_FN(rem)
+
+/// Negation
+template <typename T1>
+inline T1 neg(const T1& x)
+{
+ return -x;
+}
+KFR_FN(neg)
+
+/// @brief Fused Multiply-Add
+template <typename T1, typename T2, typename T3>
+KFR_INTRINSIC constexpr common_type<T1, T2, T3> fmadd(const T1& x, const T2& y, const T3& z)
+{
+ return x * y + z;
+}
+/// @brief Fused Multiply-Sub
+template <typename T1, typename T2, typename T3>
+KFR_INTRINSIC constexpr common_type<T1, T2, T3> fmsub(const T1& x, const T2& y, const T3& z)
+{
+ return x * y - z;
+}
+KFR_FN(fmadd)
+KFR_FN(fmsub)
+
+/// @brief Linear blend of `x` and `y` (`c` must be in the range 0...+1)
+/// Returns `x + ( y - x ) * c`
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
+KFR_INTRINSIC constexpr common_type<T1, T2, T3> mix(const T1& c, const T2& x, const T3& y)
+{
+ return fmadd(c, y - x, x);
+}
+
+/// @brief Linear blend of `x` and `y` (`c` must be in the range -1...+1)
+template <typename T1, typename T2, typename T3, KFR_ENABLE_IF(is_numeric_args<T1, T2, T3>::value)>
+KFR_INTRINSIC constexpr common_type<T1, T2, T3> mixs(const T1& c, const T2& x, const T3& y)
+{
+ return mix(fmadd(c, 0.5, 0.5), x, y);
+}
+KFR_FN(mix)
+KFR_FN(mixs)
+
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
+KFR_INTRINSIC internal::expression_function<fn::mix, E1, E2, E3> mix(E1&& c, E2&& x, E3&& y)
+{
+ return { fn::mix(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) };
+}
+
+template <typename E1, typename E2, typename E3, KFR_ENABLE_IF(is_input_expressions<E1, E2, E3>::value)>
+KFR_INTRINSIC internal::expression_function<fn::mixs, E1, E2, E3> mixs(E1&& c, E2&& x, E3&& y)
+{
+ return { fn::mixs(), std::forward<E1>(c), std::forward<E2>(x), std::forward<E3>(y) };
+}
+
namespace intrinsics
{

// Recursion base case: a polynomial with a single coefficient is that constant.
template <typename T1, typename T2>
constexpr KFR_INTRINSIC common_type<T1, T2> horner(const T1&, const T2& c0)
{
    return c0;
}

// Horner evaluation: c0 + x*(c1 + x*(c2 + ...)). Coefficients are given lowest
// power first; each step folds one coefficient in with a fused multiply-add.
template <typename T1, typename T2, typename T3, typename... Ts>
constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner(const T1& x, const T2& c0, const T3& c1,
                                                              const Ts&... values)
{
    return fmadd(horner(x, c1, values...), x, c0);
}

// Base case for the even-powers variant.
template <typename T1, typename T2>
constexpr KFR_INTRINSIC common_type<T1, T2> horner_even(const T1&, const T2& c0)
{
    return c0;
}

// Even powers only: c0 + c2*x^2 + c4*x^4 + ... Substituting x2 = x*x turns the
// remaining polynomial into an ordinary one, so plain horner() finishes it.
template <typename T1, typename T2, typename T3, typename... Ts>
constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner_even(const T1& x, const T2& c0, const T3& c2,
                                                                   const Ts&... values)
{
    const T1 x2 = x * x;
    return fmadd(horner(x2, c2, values...), x2, c0);
}

// Base case for the odd-powers variant: a single coefficient scales x.
template <typename T1, typename T2>
constexpr KFR_INTRINSIC common_type<T1, T2> horner_odd(const T1& x, const T2& c1)
{
    return c1 * x;
}

// Odd powers only: c1*x + c3*x^3 + ... = x * (c1 + c3*x^2 + ...), again reduced
// to a plain polynomial in x2 = x*x and finished by horner().
template <typename T1, typename T2, typename T3, typename... Ts>
constexpr KFR_INTRINSIC common_type<T1, T2, T3, Ts...> horner_odd(const T1& x, const T2& c1, const T3& c3,
                                                                  const Ts&... values)
{
    const T1 x2 = x * x;
    return fmadd(horner(x2, c3, values...), x2, c1) * x;
}
} // namespace intrinsics
+
/// @brief Calculate polynomial using Horner's method
///
/// ``horner(x, 1, 2, 3)`` is equivalent to \(3x^2 + 2x + 1\)
/// (coefficients are passed lowest power first).
template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)>
constexpr KFR_INTRINSIC common_type<T1, Ts...> horner(const T1& x, const Ts&... c)
{
    return intrinsics::horner(x, c...);
}
KFR_FN(horner)

/// Lazy (expression) overload of horner().
template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
KFR_INTRINSIC internal::expression_function<fn::horner, E...> horner(E&&... x)
{
    return { fn::horner(), std::forward<E>(x)... };
}

/// @brief Calculate polynomial using Horner's method (even powers)
///
/// ``horner_even(x, 1, 2, 3)`` is equivalent to \(3x^4 + 2x^2 + 1\)
template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)>
constexpr KFR_INTRINSIC common_type<T1, Ts...> horner_even(const T1& x, const Ts&... c)
{
    return intrinsics::horner_even(x, c...);
}
KFR_FN(horner_even)

/// Lazy (expression) overload of horner_even().
template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
KFR_INTRINSIC internal::expression_function<fn::horner_even, E...> horner_even(E&&... x)
{
    return { fn::horner_even(), std::forward<E>(x)... };
}

/// @brief Calculate polynomial using Horner's method (odd powers)
///
/// ``horner_odd(x, 1, 2, 3)`` is equivalent to \(3x^5 + 2x^3 + 1x\)
template <typename T1, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T1, Ts...>::value)>
constexpr KFR_INTRINSIC common_type<T1, Ts...> horner_odd(const T1& x, const Ts&... c)
{
    return intrinsics::horner_odd(x, c...);
}
KFR_FN(horner_odd)

/// Lazy (expression) overload of horner_odd().
template <typename... E, KFR_ENABLE_IF(is_input_expressions<E...>::value)>
KFR_INTRINSIC internal::expression_function<fn::horner_odd, E...> horner_odd(E&&... x)
{
    return { fn::horner_odd(), std::forward<E>(x)... };
}
+
+/// @brief Calculate Multiplicative Inverse of `x`
+/// Returns `1/x`
+template <typename T>
+constexpr KFR_INTRINSIC T reciprocal(const T& x)
+{
+ static_assert(std::is_floating_point<subtype<T>>::value, "T must be floating point type");
+ return subtype<T>(1) / x;
+}
+KFR_FN(reciprocal)
+
/// Multiplies `x` by the sign of `y`: flips the sign bit of `x` wherever `y` is
/// negative, implemented purely with bit operations: x XOR (y AND sign-bit mask).
// assumes highbitmask() selects the sign bit of the element type (floating-point
// layout) — for integers this is the top bit; TODO confirm intended use is floats
template <typename T1, typename T2>
KFR_INTRINSIC common_type<T1, T2> mulsign(const T1& x, const T2& y)
{
    return bitwisexor(x, bitwiseand(y, special_constants<T2>::highbitmask()));
}
KFR_FN(mulsign)
+
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y)
+{
+ return (x & special_constants<T>::highbitmask()) | (y & special_constants<T>::highbitmask());
+}
+
/// @brief Swap byte order
// Vector form (wider than 8 bytes): reverse bytes within each element via a
// byte-level shuffle of the vector reinterpreted as u8 lanes.
template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)>
KFR_INTRINSIC vec<T, N> swapbyteorder(const vec<T, N>& x)
{
    return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x)));
}
// Scalar forms: reinterpret the value as an unsigned integer of the same width,
// byte-swap with a compiler builtin, reinterpret back.
// NOTE(review): __builtin_bswap16/32/64 are GCC/Clang builtins; confirm MSVC
// builds either never instantiate these overloads or map them elsewhere.
template <typename T, KFR_ENABLE_IF(sizeof(T) == 8)>
KFR_INTRINSIC T swapbyteorder(const T& x)
{
    return reinterpret_cast<const T&>(__builtin_bswap64(reinterpret_cast<const u64&>(x)));
}
template <typename T, KFR_ENABLE_IF(sizeof(T) == 4)>
KFR_INTRINSIC T swapbyteorder(const T& x)
{
    return reinterpret_cast<const T&>(__builtin_bswap32(reinterpret_cast<const u32&>(x)));
}
template <typename T, KFR_ENABLE_IF(sizeof(T) == 2)>
KFR_INTRINSIC T swapbyteorder(const T& x)
{
    return reinterpret_cast<const T&>(__builtin_bswap16(reinterpret_cast<const u16&>(x)));
}
KFR_FN(swapbyteorder)
+
// Alternating add/subtract of lanes, mirroring the SSE3 addsub/subadd idiom:
// both candidates (a + b) and (a - b) are computed and blended per lane with
// complementary blend<> patterns. The exact per-lane assignment (which lanes get
// the sum vs. the difference) is defined by blend<1, 0> / blend<0, 1> —
// presumably subadd yields {a0-b0, a1+b1, ...}; TODO confirm against blend<>.
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
KFR_INTRINSIC vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b)
{
    return blend<1, 0>(a + b, a - b);
}
// Complementary pattern to subadd().
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
KFR_INTRINSIC vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b)
{
    return blend<0, 1>(a + b, a - b);
}
KFR_FN(subadd)
KFR_FN(addsub)
+
// Negates even-indexed lanes: for floating-point T, -T() is negative zero, so the
// XOR flips only the sign bit of the selected lanes. For integer T, -T() == 0 and
// this is a no-op — presumably these are intended for float vectors only (confirm).
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> negeven(const vec<T, N>& x)
{
    return x ^ broadcast<N>(-T(), T());
}
// Negates odd-indexed lanes (same sign-bit XOR technique).
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> negodd(const vec<T, N>& x)
{
    return x ^ broadcast<N>(T(), -T());
}
+
// Lazy-expression operator overloads: none of these compute anything. Each one
// returns an expression_function node that captures its operands and applies the
// corresponding fn:: functor when the expression is evaluated. The
// is_input_expression(s) SFINAE guard restricts the overloads to KFR expression
// types so ordinary arithmetic is unaffected.

/// Element-wise negation of an expression.
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_INTRINSIC internal::expression_function<fn::neg, E1> operator-(E1&& e1)
{
    return { fn::neg(), std::forward<E1>(e1) };
}

/// Element-wise bitwise NOT of an expression.
template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>::value)>
KFR_INTRINSIC internal::expression_function<fn::bitwisenot, E1> operator~(E1&& e1)
{
    return { fn::bitwisenot(), std::forward<E1>(e1) };
}

/// Element-wise addition.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::add, E1, E2> operator+(E1&& e1, E2&& e2)
{
    return { fn::add(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise subtraction.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::sub, E1, E2> operator-(E1&& e1, E2&& e2)
{
    return { fn::sub(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise multiplication.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::mul, E1, E2> operator*(E1&& e1, E2&& e2)
{
    return { fn::mul(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise division.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::div, E1, E2> operator/(E1&& e1, E2&& e2)
{
    return { fn::div(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise bitwise AND.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::bitwiseand, E1, E2> operator&(E1&& e1, E2&& e2)
{
    return { fn::bitwiseand(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise bitwise OR.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::bitwiseor, E1, E2> operator|(E1&& e1, E2&& e2)
{
    return { fn::bitwiseor(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise bitwise XOR.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::bitwisexor, E1, E2> operator^(E1&& e1, E2&& e2)
{
    return { fn::bitwisexor(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise left shift.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::shl, E1, E2> operator<<(E1&& e1, E2&& e2)
{
    return { fn::shl(), std::forward<E1>(e1), std::forward<E2>(e2) };
}

/// Element-wise right shift.
template <typename E1, typename E2, KFR_ENABLE_IF(is_input_expressions<E1, E2>::value)>
KFR_INTRINSIC internal::expression_function<fn::shr, E1, E2> operator>>(E1&& e1, E2&& e2)
{
    return { fn::shr(), std::forward<E1>(e1), std::forward<E2>(e2) };
}
+
// Packs the lanes of N1 input vectors into a vector of subvectors: presumably
// element i of the result is the subvector {x[i], rest[i]...} (a transpose of the
// inputs viewed as a matrix). All Ns... are expected to equal N1 — TODO confirm.
template <typename T, size_t N1, size_t... Ns>
vec<vec<T, sizeof...(Ns) + 1>, N1> packtranspose(const vec<T, N1>& x, const vec<T, Ns>&... rest)
{
    const vec<T, N1*(sizeof...(Ns) + 1)> t = transpose<N1>(concat(x, rest...));
    return t.v; // reinterpret the flat transposed vector as a vector of subvectors
}

KFR_FN(packtranspose)
+
// Constructs a mask from a plain vector: each lane is reinterpreted as a signed
// integer and compared against zero, so a set sign bit produces an all-ones mask
// lane and a clear sign bit produces all-zeros.
template <typename T, size_t N>
KFR_I_CE mask<T, N>::mask(const base& v) CMT_NOEXCEPT
{
    this->v = base::frombits((vec<itype<T>, N>::frombits(v) < itype<T>(0)).asvec()).v;
}
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp
@@ -0,0 +1,286 @@
+/** @addtogroup types
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "types.hpp"
+
+namespace kfr
+{
+
/// @brief An enumeration representing cpu instruction set
enum class cpu_t : int
{
    generic = 0,
#ifdef CMT_ARCH_X86
    sse2   = 1,
    sse3   = 2,
    ssse3  = 3,
    sse41  = 4,
    sse42  = 5,
    avx1   = 6,
    avx2   = 7,
    avx512 = 8, // F, CD, VL, DQ and BW
    avx    = static_cast<int>(avx1),
    lowest = static_cast<int>(sse2),
    highest = static_cast<int>(avx512),
#endif
#ifdef CMT_ARCH_ARM
    neon    = 1,
    neon64  = 2,
    lowest  = static_cast<int>(neon),
    highest = static_cast<int>(neon64),
#endif
    // the set this translation unit is being compiled for (CMT_ARCH_NAME expands
    // to one of the enumerator names above)
    native = static_cast<int>(CMT_ARCH_NAME),

#ifdef CMT_ARCH_AVX
// note: this #define sits inside the enum body but the preprocessor is oblivious
// to that — it simply marks that a lower "secondary" target exists when compiling
// for AVX or newer
#define KFR_HAS_SECONDARY_PLATFORM
    secondary = static_cast<int>(sse42),
#else
    secondary = static_cast<int>(native),
#endif

    common  = generic, // For compatibility
    runtime = -1, // sentinel: resolve the instruction set at run time
};
+
// Convenience macro for declaring a defaulted `cpu` template parameter.
#define KFR_ARCH_DEP cpu_t cpu = cpu_t::native

// Compile-time value wrapper for a cpu_t enumerator.
template <cpu_t cpu>
using ccpu_t = cval_t<cpu_t, cpu>;

// Variable-template instance of the wrapper, for tag-dispatch call sites.
template <cpu_t cpu>
constexpr ccpu_t<cpu> ccpu{};
+
namespace internal_generic
{
// Previous/next enumerator helpers for stepping through instruction sets.
constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); }
constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); }

#ifdef CMT_ARCH_X86
// Dispatchable sets, newest first. NOTE(review): sse42 is absent from this list
// even though cpu_t defines it — confirm the dispatch granularity is intentional.
constexpr auto cpu_list = cvals_t<cpu_t, cpu_t::avx512, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3,
                                  cpu_t::sse3, cpu_t::sse2>();
#else
// NOTE(review): only `neon` is listed; `neon64` exists in cpu_t — confirm its
// omission from the runtime list is intentional.
constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>;
#endif
} // namespace internal_generic
+
// Compile-time value wrapper for cpu_t (alias of ccpu_t above).
template <cpu_t cpu>
using cpuval_t = cval_t<cpu_t, cpu>;
template <cpu_t cpu>
constexpr auto cpuval = cpuval_t<cpu>{};

// Instruction sets from the dispatch list that are at or above the compile-time
// native target (candidates usable on this build).
constexpr auto cpu_all =
    cfilter(internal_generic::cpu_list, internal_generic::cpu_list >= cpuval_t<cpu_t::native>());
+
/// @brief Returns name of the cpu instruction set
// The names array is indexed by the enumerator's integer value; slot 0
// ("generic") keeps indices aligned even though the range check below excludes
// it, so both cpu_t::generic and cpu_t::runtime fall through to "-".
CMT_UNUSED static const char* cpu_name(cpu_t set)
{
#ifdef CMT_ARCH_X86
    static const char* names[] = { "generic", "sse2", "sse3", "ssse3", "sse41",
                                   "sse42",   "avx",  "avx2", "avx512" };
#endif
#ifdef CMT_ARCH_ARM
    static const char* names[] = { "generic", "neon", "neon64" };
#endif
    if (set >= cpu_t::lowest && set <= cpu_t::highest)
        return names[static_cast<size_t>(set)];
    return "-";
}
+
#ifdef CMT_ARCH_X64
// Selects between a 32-bit and a 64-bit constant at compile time; this branch is
// the 64-bit build, so the second argument is returned. The const char* overload
// is kept separate (returns by value) so string literals work alongside the
// generic by-reference overload.
template <int = 0>
constexpr inline const char* bitness_const(const char*, const char* x64)
{
    return x64;
}
template <typename T>
constexpr inline const T& bitness_const(const T&, const T& x64)
{
    return x64;
}
#else
// 32-bit build: return the first argument.
template <int = 0>
constexpr inline const char* bitness_const(const char* x32, const char*)
{
    return x32;
}
template <typename T>
constexpr inline const T& bitness_const(const T& x32, const T&)
{
    return x32;
}
#endif
+
// Compile-time platform traits per instruction set; specialized below. Each
// newer set inherits from the previous one and overrides only what changes.
template <cpu_t c = cpu_t::native>
struct platform;

#ifdef CMT_ARCH_X86
// Baseline x86 assumptions shared by all sets.
template <>
struct platform<cpu_t::common>
{
    // assumed cache line size in bytes
    constexpr static size_t native_cache_alignment = 64;
    constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1;
    // largest alignment any supported set may require (AVX-512: 64 bytes)
    constexpr static size_t maximum_vector_alignment = 64;
    constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1;

    constexpr static size_t simd_register_count = 1;

    constexpr static size_t common_float_vector_size = 16;
    constexpr static size_t common_int_vector_size = 16;

    constexpr static size_t minimum_float_vector_size = 16;
    constexpr static size_t minimum_int_vector_size = 16;

    // full native vector width in bytes
    constexpr static size_t native_float_vector_size = 16;
    constexpr static size_t native_int_vector_size = 16;

    constexpr static size_t native_vector_alignment = 16;
    constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;

    // whether unaligned loads/stores are considered cheap on this set
    constexpr static bool fast_unaligned = false;
};
// SSE2: 8 xmm registers in 32-bit mode, 16 in 64-bit mode.
template <>
struct platform<cpu_t::sse2> : platform<cpu_t::common>
{
    constexpr static size_t simd_register_count = bitness_const(8, 16);
};
template <>
struct platform<cpu_t::sse3> : platform<cpu_t::sse2>
{
};
template <>
struct platform<cpu_t::ssse3> : platform<cpu_t::sse3>
{
};
template <>
struct platform<cpu_t::sse41> : platform<cpu_t::ssse3>
{
};
template <>
struct platform<cpu_t::sse42> : platform<cpu_t::sse41>
{
};
// AVX: 32-byte float vectors; integer vectors stay 16 bytes until AVX2.
template <>
struct platform<cpu_t::avx> : platform<cpu_t::sse42>
{
    constexpr static size_t native_float_vector_size = 32;

    constexpr static size_t native_vector_alignment = 32;
    constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;

    constexpr static bool fast_unaligned = true;
};
// AVX2 extends 32-byte vectors to integer types.
template <>
struct platform<cpu_t::avx2> : platform<cpu_t::avx>
{
    constexpr static size_t native_int_vector_size = 32;
};
// AVX-512: 64-byte vectors; 32 zmm registers in 64-bit mode.
template <>
struct platform<cpu_t::avx512> : platform<cpu_t::avx2>
{
    constexpr static size_t native_float_vector_size = 64;
    constexpr static size_t native_int_vector_size = 64;

    constexpr static size_t native_vector_alignment = 64;
    constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;

    constexpr static size_t simd_register_count = bitness_const(8, 32);
};
#endif
#ifdef CMT_ARCH_ARM
// ARM baseline: 16-byte NEON vectors throughout.
template <>
struct platform<cpu_t::common>
{
    constexpr static size_t native_cache_alignment = 64;
    constexpr static size_t native_cache_alignment_mask = native_cache_alignment - 1;
    constexpr static size_t maximum_vector_alignment = 16;
    constexpr static size_t maximum_vector_alignment_mask = maximum_vector_alignment - 1;

    constexpr static size_t simd_register_count = 1;

    constexpr static size_t common_float_vector_size = 16;
    constexpr static size_t common_int_vector_size = 16;

    constexpr static size_t minimum_float_vector_size = 16;
    constexpr static size_t minimum_int_vector_size = 16;

    constexpr static size_t native_float_vector_size = 16;
    constexpr static size_t native_int_vector_size = 16;

    constexpr static size_t native_vector_alignment = 16;
    constexpr static size_t native_vector_alignment_mask = native_vector_alignment - 1;

    constexpr static bool fast_unaligned = false;
};
template <>
struct platform<cpu_t::neon> : platform<cpu_t::common>
{
};
template <>
struct platform<cpu_t::neon64> : platform<cpu_t::neon>
{
};
#endif
+
+inline namespace CMT_ARCH_NAME
+{
+
/// @brief SIMD vector width for the given cpu instruction set
// Number of elements of T in a full native vector; clamped to at least 1 so
// element types larger than a native vector still work.
template <typename T>
constexpr static size_t vector_width =
    (const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::native_float_vector_size / sizeof(T)
                                                      : platform<>::native_int_vector_size / sizeof(T)));

// Smallest vector width (elements of T) the code will generate for this target.
template <typename T>
constexpr static size_t minimum_vector_width =
    (const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::minimum_float_vector_size / sizeof(T)
                                                      : platform<>::minimum_int_vector_size / sizeof(T)));

// Total elements of T that fit in the whole register file. (The odd spacing in
// `simd_register_count* vector_width` is a formatter artifact; it is an ordinary
// multiplication.)
template <typename T>
constexpr static size_t vector_capacity = platform<>::simd_register_count* vector_width<T>;

#ifdef CMT_COMPILER_MSVC
// MSVC path: cap the largest synthetic vector at 2x native width (capped at 32).
template <typename T>
constexpr static size_t maximum_vector_size = const_min(static_cast<size_t>(32), vector_width<T> * 2);
#else
// Other compilers: scale with a quarter of the register count (at least 1x).
template <typename T>
constexpr static size_t maximum_vector_size = const_min(
    static_cast<size_t>(32), const_max(size_t(1), platform<>::simd_register_count / 4) * vector_width<T>);
#endif

// True when `size` is a power of two within [minimum_vector_width, vector_width].
template <typename T>
constexpr static bool is_simd_size(size_t size)
{
    return is_poweroftwo(size) && size >= minimum_vector_width<T> && size <= vector_width<T>;
}

// Forward declarations with the native width as the default vector length.
template <typename T, size_t N = vector_width<T>>
struct vec;
template <typename T, size_t N = vector_width<T>>
struct mask;
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/read_write.hpp b/include/kfr/simd/read_write.hpp
@@ -0,0 +1,243 @@
+/** @addtogroup read_write
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "shuffle.hpp"
+#include "types.hpp"
+#include "vec.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
/// Reads N elements of T from memory into a vector. A = true asserts the pointer
/// is suitably aligned for vec<T, N>; A = false performs an unaligned load.
template <size_t N, bool A = false, typename T>
KFR_INTRINSIC static vec<T, N> read(const T* src)
{
    return vec<T, N>(src, cbool_t<A>());
}

/// Writes a vector to memory. A = true asserts the destination is aligned.
template <bool A = false, size_t N, typename T>
KFR_INTRINSIC static void write(T* dest, const vec<T, N>& value)
{
    value.write(dest, cbool_t<A>());
}
+
/// Gathers scalars from the given run-time indices into one vector.
template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)>
KFR_INTRINSIC vec<T, Nout> gather(const T* base, size_t index, Indices... indices)
{
    return make_vector(base[index], base[indices]...);
}

/// Gathers scalars from compile-time indices into one vector.
template <size_t Index, size_t... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)>
KFR_INTRINSIC vec<T, Nout> gather(const T* base)
{
    return make_vector(base[Index], base[Indices]...);
}

// NOTE(review): this overload appears broken and likely unused: it stores through
// a const pointer, the recursive call passes the type T and value N into the
// size_t index pack, and there is no terminating overload for an empty index
// list. Left untouched pending confirmation that nothing instantiates it.
template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0>
KFR_INTRINSIC void scatter(const T* base, const vec<T, N>& value)
{
    base[Index] = value[InIndex];
    scatter<Indices..., T, N, InIndex + 1>(base, value);
}
+
namespace internal
{
// Expands the index vector at compile time into one element load per lane.
template <typename T, size_t N, size_t... Indices>
KFR_INTRINSIC vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>)
{
    return make_vector(base[indices[Indices]]...);
}
// Loads Nout elements spaced by a compile-time Stride.
template <size_t Nout, size_t Stride, typename T, size_t... Indices>
KFR_INTRINSIC vec<T, Nout> gather_stride(const T* base, csizes_t<Indices...>)
{
    return make_vector(base[Indices * Stride]...);
}
// Loads Nout groups of `groupsize` elements with a run-time stride (in groups).
// NOTE(review): declared return type is vec<T, Nout> while each make_vector
// argument is itself a vec<T, groupsize> — presumably relies on vec-of-vec
// flattening; confirm against make_vector.
template <size_t Nout, size_t groupsize, typename T, size_t... Indices>
KFR_INTRINSIC vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<Indices...>)
{
    return make_vector(read<groupsize>(base + Indices * groupsize * stride)...);
}
} // namespace internal
+
/// Gathers N elements at run-time indices given as a u32 vector.
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> gather(const T* base, const vec<u32, N>& indices)
{
    return internal::gather(base, indices, csizeseq<N>);
}

/// Gathers Nout groups of `groupsize` elements, `stride` groups apart (run-time
/// stride).
template <size_t Nout, size_t groupsize = 1, typename T>
KFR_INTRINSIC vec<T, Nout * groupsize> gather_stride(const T* base, size_t stride)
{
    return internal::gather_stride_s<Nout, groupsize>(base, stride, csizeseq<Nout>);
}

/// Gathers Nout elements spaced by a compile-time Stride.
template <size_t Nout, size_t Stride, typename T>
KFR_INTRINSIC vec<T, Nout> gather_stride(const T* base)
{
    return internal::gather_stride<Nout, Stride>(base, csizeseq<Nout>);
}
+
// Expands the offset vector at compile time and concatenates one group-read per
// offset. Offsets are in units of groups, hence the `groupsize *` scaling.
// NOTE(review): `(*offset)[Indices]` dereferences the vec before indexing —
// presumably operator* exposes the underlying storage; confirm.
template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices>
KFR_INTRINSIC vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset,
                                                  csizes_t<Indices...>)
{
    return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...);
}
/// Gathers N groups of `groupsize` elements from per-group offsets.
template <size_t groupsize = 1, typename T, size_t N, typename IT>
KFR_INTRINSIC vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset)
{
    return gather_helper<groupsize>(base, offset, csizeseq<N>);
}
+
// Writes each `groupsize`-wide slice of `value` to base + groupsize * offset[i],
// expanding the compile-time index pack via the swallow{} trick.
template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices>
KFR_INTRINSIC void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value,
                                  csizes_t<Indices...>)
{
    swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)),
              0)... };
}
+template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, size_t... Indices>
+KFR_INTRINSIC void scatter_helper_s(T* base, size_t stride, const vec<T, Nout>& value, csizes_t<Indices...>)
+{
+ swallow{ (write(base + groupsize * stride, slice<Indices * groupsize, groupsize>(value)), 0)... };
+}
/// Scatters N groups of `groupsize` elements to per-group offsets.
template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT>
KFR_INTRINSIC void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value)
{
    return scatter_helper<groupsize>(base, offset, value, csizeseq<N>);
}

/// Scatters groups of `groupsize` elements with a constant run-time stride.
// NOTE(review): N cannot be deduced from the arguments (Nout is deduced from
// `value`, so its default never applies) and IT is unused — callers must supply
// the template arguments explicitly; confirm call sites do so correctly.
template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT>
KFR_INTRINSIC void scatter_stride(T* base, const vec<T, Nout>& value, size_t stride)
{
    return scatter_helper_s<groupsize>(base, stride, value, csizeseq<N>);
}
+
/// Pointer wrapper that reads/writes vectors with a run-time stride between
/// element groups. The mutable variant adds write() on top of the const base.
template <typename T, size_t groupsize = 1>
struct stride_pointer : public stride_pointer<const T, groupsize>
{
    template <size_t N>
    void write(const vec<T, N>& val, csize_t<N> = csize_t<N>())
    {
        // NOTE(review): scatter_stride's template parameters are
        // <groupsize, T, N, ...>, so <N, groupsize> looks transposed here, and the
        // run-time stride argument is not forwarded — verify this member is
        // actually instantiated anywhere.
        kfr::scatter_stride<N, groupsize>(this->ptr, val);
    }
};

/// const specialization: holds the pointer and stride and provides strided reads.
template <typename T, size_t groupsize>
struct stride_pointer<const T, groupsize>
{
    const T* ptr;          // base address of the strided data
    const size_t stride;   // distance between consecutive groups, in groups

    template <size_t N>
    vec<T, N> read(csize_t<N> = csize_t<N>())
    {
        return kfr::gather_stride<N, groupsize>(ptr, stride);
    }
};
+
// Lookup table backing partial_mask(): the first half (32 entries) holds
// all-ones values, the second half holds zeros. partial_mask() reads a window
// that straddles the midpoint, yielding a mask whose leading lanes are set.
// Sized for vectors of up to 32 elements.
template <typename T>
constexpr T partial_masks[] = { constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                constants<T>::allones(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T(),
                                T() };
+
/// Returns a vector whose first `index` lanes are all-ones and the rest zero.
/// Implemented as an unaligned read from partial_masks ending `index` elements
/// into the all-ones half (window [mid - index, mid - index + N)).
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> partial_mask(size_t index)
{
    static_assert(N <= arraysize(partial_masks<T>) / 2,
                  "N must not be greater than half of partial_masks array");
    return read<N>(&partial_masks<T>[0] + arraysize(partial_masks<T>) / 2 - index);
}
/// Overload taking a vec_shape tag to deduce T and N from a value.
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N> partial_mask(size_t index, vec_shape<T, N>)
{
    return partial_mask<T, N>(index);
}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/shuffle.hpp b/include/kfr/simd/shuffle.hpp
@@ -0,0 +1,569 @@
+/** @addtogroup shuffle
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+#include "constants.hpp"
+#include "mask.hpp"
+#include "types.hpp"
+#include "vec.hpp"
+
+#include <tuple>
+#include <utility>
+
+namespace kfr
+{
+
+inline namespace CMT_ARCH_NAME
+{
+
// Lower part of a vector: for power-of-two N this is the low half; for other N
// the split point is the largest power of two below N (e.g. N=7 splits 4 + 3).
template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)>
KFR_INTRINSIC vec<T, Nout> low(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<Nout>);
}

// Shape-only overload: computes the low-part shape without touching data.
template <typename T, size_t N, size_t Nout = prev_poweroftwo(N - 1)>
KFR_INTRINSIC vec_shape<T, Nout> low(vec_shape<T, N>)
{
    return {};
}

// Upper (remaining) part complementing low().
template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)>
KFR_INTRINSIC vec<T, Nout> high(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<Nout, prev_poweroftwo(N - 1)>);
}

// Shape-only overload for the upper part.
template <typename T, size_t N, size_t Nout = N - prev_poweroftwo(N - 1)>
KFR_INTRINSIC vec_shape<T, Nout> high(vec_shape<T, N>)
{
    return {};
}
+
/// Concatenates any number of vectors into one containing all elements in
/// argument order.
template <typename T, size_t... Ns>
KFR_INTRINSIC vec<T, csum<size_t, Ns...>()> concat(const vec<T, Ns>&... vs) CMT_NOEXCEPT
{
    return vec<T, csum<size_t, Ns...>()>(
        intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, Ns>::scalar_size()...>(vs.v...));
}

/// Two-vector concatenation (avoids the generic pack expansion).
template <typename T, size_t N1, size_t N2>
KFR_INTRINSIC vec<T, N1 + N2> concat2(const vec<T, N1>& x, const vec<T, N2>& y) CMT_NOEXCEPT
{
    return vec<T, csum<size_t, N1, N2>()>(
        intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N1>::scalar_size(),
                                vec<T, N2>::scalar_size()>(x.v, y.v));
}

/// Four equal-size vectors, concatenated pairwise to keep the intrinsic tree
/// balanced ((a|b) | (c|d)).
template <typename T, size_t N>
KFR_INTRINSIC vec<T, N * 4> concat4(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c,
                                    const vec<T, N>& d) CMT_NOEXCEPT
{
    return intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N * 2>::scalar_size(),
                                   vec<T, N * 2>::scalar_size()>(
        intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(),
                                vec<T, N>::scalar_size()>(a.v, b.v),
        intrinsics::simd_concat<typename vec<T, 1>::scalar_type, vec<T, N>::scalar_size(),
                                vec<T, N>::scalar_size()>(c.v, d.v));
}
+
/// Repeats the whole vector `count` times: {1,2} -> {1,2,1,2,...}.
template <size_t count, typename T, size_t N, size_t Nout = N* count>
KFR_INTRINSIC vec<T, Nout> repeat(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<Nout> % csize<N>);
}

/// Resizes to Nout elements by cyclic repetition of the source lanes.
template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout != N)>
KFR_INTRINSIC vec<T, Nout> resize(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<Nout> % csize<N>);
}
/// Identity overload when the size already matches (no copy, no shuffle).
template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout == N)>
constexpr KFR_INTRINSIC const vec<T, Nout>& resize(const vec<T, N>& x)
{
    return x;
}
+
namespace intrinsics
{

// Builds a vector of Nout elements by cycling over the given values: element i
// receives values[i % Nin].
template <typename T, typename... Ts, size_t... indices, size_t Nin = sizeof...(Ts),
          size_t Nout = sizeof...(indices)>
KFR_INTRINSIC vec<T, Nout> broadcast_helper(csizes_t<indices...>, const Ts&... values)
{
    const std::tuple<Ts...> tup(values...);
    return vec<T, Nout>(std::get<indices % Nin>(tup)...);
}
} // namespace intrinsics

/// Fills a vector of Nout elements by repeating the given values in order.
template <size_t Nout, typename... Ts, typename C = typename std::common_type<Ts...>::type>
KFR_INTRINSIC vec<C, Nout> broadcast(const Ts&... values)
{
    return intrinsics::broadcast_helper<C>(csizeseq<Nout>, values...);
}
KFR_FN(broadcast)
+
/// Appends Ncount lanes past the end of x (shuffle indices beyond N fall out of
/// range and yield unspecified/zero lanes per vec::shuffle's handling).
template <size_t Ncount, typename T, size_t N>
KFR_INTRINSIC vec<T, N + Ncount> padhigh(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<N + Ncount>);
}
KFR_FN(padhigh)

/// Prepends Ncount lanes before x. `0 - Ncount` deliberately wraps the unsigned
/// start so the first Ncount shuffle indices are out of range.
// NOTE(review): relies on vec::shuffle treating out-of-range indices as
// zero/undefined lanes — confirm which before depending on the padded values.
template <size_t Ncount, typename T, size_t N>
KFR_INTRINSIC vec<T, N + Ncount> padlow(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<N + Ncount, 0 - Ncount>);
}
KFR_FN(padlow)
+
/// Extends a single-element vector to Nout lanes by broadcasting its value.
template <size_t Nout, typename T>
KFR_INTRINSIC vec<T, Nout> extend(const vec<T, 1>& x)
{
    return vec<T, Nout>(x.front());
}
/// Extends a wider vector to Nout lanes (extra indices fall out of range, as in
/// padhigh).
template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N != Nout)>
KFR_INTRINSIC vec<T, Nout> extend(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<Nout>);
}
/// Identity overload when the size already matches.
template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(N == Nout)>
constexpr KFR_INTRINSIC const vec<T, Nout>& extend(const vec<T, N>& x)
{
    return x;
}
KFR_FN(extend)
+
/// Extracts `count` consecutive lanes starting at `start`.
template <size_t start, size_t count, typename T, size_t N>
KFR_INTRINSIC vec<T, count> slice(const vec<T, N>& x)
{
    return x.shuffle(csizeseq<count, start>);
}
/// Slice across the concatenation of x and y (indices N..2N-1 select from y).
template <size_t start, size_t count, typename T, size_t N>
KFR_INTRINSIC vec<T, count> slice(const vec<T, N>& x, const vec<T, N>& y)
{
    return x.shuffle(y, csizeseq<count, start>);
}
KFR_FN(slice)

/// Returns x with lanes [start, start + count) replaced by the corresponding
/// lanes of y; the boolean mask adds N to the index to select from y.
template <size_t start, size_t count, typename T, size_t N>
KFR_INTRINSIC vec<T, N> replace(const vec<T, N>& x, const vec<T, N>& y)
{
    return x.shuffle(y, csizeseq<N> + (csizeseq<N> >= csize<start> && csizeseq<N> < csize<start + count>)*N);
}
KFR_FN(replace)
+
// Recursion terminator for the variadic split below.
template <size_t, typename T, size_t N>
KFR_INTRINSIC void split(const vec<T, N>&)
{
}
// Splits x into consecutive slices, writing each into the given output vectors
// in order; each output's size determines how many lanes it consumes.
template <size_t start = 0, typename T, size_t N, size_t Nout, typename... Args>
KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, Nout>& out, Args&&... args)
{
    out = x.shuffle(csizeseq<Nout, start>);
    split<start + Nout>(x, std::forward<Args>(args)...);
}
// Dedicated two-way split into equal halves.
template <typename T, size_t N>
KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, N / 2>& low, vec<T, N / 2>& high)
{
    low  = x.shuffle(csizeseq<N / 2, 0>);
    high = x.shuffle(csizeseq<N / 2, N / 2>);
}
// Dedicated four-way split into equal quarters.
template <typename T, size_t N>
KFR_INTRINSIC void split(const vec<T, N>& x, vec<T, N / 4>& w0, vec<T, N / 4>& w1, vec<T, N / 4>& w2,
                         vec<T, N / 4>& w3)
{
    w0 = x.shuffle(csizeseq<N / 4, 0>);
    w1 = x.shuffle(csizeseq<N / 4, N / 4>);
    w2 = x.shuffle(csizeseq<N / 4, 2 * N / 4>);
    w3 = x.shuffle(csizeseq<N / 4, 3 * N / 4>);
}
KFR_FN(split)
+
/// Returns part `number` of the vector divided into `total` equal parts
/// (N must be divisible by total).
template <size_t total, size_t number, typename T, size_t N, size_t Nout = N / total>
KFR_INTRINSIC vec<T, Nout> part(const vec<T, N>& x)
{
    static_assert(N % total == 0, "N % total == 0");
    return x.shuffle(csizeseq<Nout, number * Nout>);
}
KFR_FN(part)
+
/// Slice of the virtual concatenation (x | y): equal sizes map directly onto a
/// two-source shuffle.
template <size_t start, size_t count, typename T, size_t N>
KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N>& x, const vec<T, N>& y)
{
    return x.shuffle(y, csizeseq<count, start>);
}

// Unequal sizes, x wider: first widen y to N1 lanes, shuffle the pair, then slice.
template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 > N2)>
KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y)
{
    return x.shuffle(y.shuffle(csizeseq<N1>), csizeseq<N1 * 2>).shuffle(csizeseq<count, start>);
}

// Unequal sizes, y wider: shift x up to N2 lanes (leading lanes out of range),
// combine with y, then slice compensating for the shift.
template <size_t start, size_t count, typename T, size_t N1, size_t N2, KFR_ENABLE_IF(N1 < N2)>
KFR_INTRINSIC vec<T, count> concat_and_slice(const vec<T, N1>& x, const vec<T, N2>& y)
{
    return x.shuffle(csizeseq<N2, -(N2 - N1)>)
        .shuffle(y, csizeseq<N2 * 2>)
        .shuffle(csizeseq<count, N2 - N1 + start>);
}

KFR_FN(concat_and_slice)
+
/// Grows a vector to Nout lanes, filling the new trailing lanes with `newvalue`.
template <size_t Nout, typename T, size_t N, KFR_ENABLE_IF(Nout > N)>
KFR_INTRINSIC vec<T, Nout> widen(const vec<T, N>& x, identity<T> newvalue = T())
{
    static_assert(Nout > N, "Nout > N");
    return concat(x, broadcast<Nout - N>(newvalue));
}
/// Identity overload when the vector already has Nout lanes (fill value unused).
template <size_t Nout, typename T, typename TS>
constexpr KFR_INTRINSIC const vec<T, Nout>& widen(const vec<T, Nout>& x, TS)
{
    return x;
}
KFR_FN(widen)

/// Truncates a vector to its first Nout lanes.
template <size_t Nout, typename T, size_t N>
KFR_INTRINSIC vec<T, Nout> narrow(const vec<T, N>& x)
{
    static_assert(Nout <= N, "Nout <= N");
    return slice<0, Nout>(x);
}
KFR_FN(narrow)
+
+template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)>
+KFR_INTRINSIC vec<T, Nout> even(const vec<T, N>& x)
+{
+ return x.shuffle(scale<group>(csizeseq<Nout / group, 0, 2>));
+}
+KFR_FN(even)
+
+template <size_t group = 1, typename T, size_t N, size_t Nout = N / 2, KFR_ENABLE_IF(N >= 2 && (N & 1) == 0)>
+KFR_INTRINSIC vec<T, Nout> odd(const vec<T, N>& x)
+{
+ return x.shuffle(scale<group>(csizeseq<Nout / group, 1, 2>));
+}
+KFR_FN(odd)
+
+/// @brief Duplicates each even element over the following odd slot: {a, b, c, d} -> {a, a, c, c}.
+// Clearing bit 0 of each index maps every odd index to the preceding even one.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> dupeven(const vec<T, N>& x)
+{
+    static_assert(N % 2 == 0, "N must be even");
+    return x.shuffle(csizeseq<N, 0, 1> & ~csize<1>);
+}
+KFR_FN(dupeven)
+
+/// @brief Duplicates each odd element over the preceding even slot: {a, b, c, d} -> {b, b, d, d}.
+// Setting bit 0 of each index maps every even index to the following odd one.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> dupodd(const vec<T, N>& x)
+{
+    static_assert(N % 2 == 0, "N must be even");
+    return x.shuffle(csizeseq<N, 0, 1> | csize<1>);
+}
+KFR_FN(dupodd)
+
+/// @brief Returns a vector of twice the size containing the input twice: {a, b} -> {a, b, a, b}.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N * 2> duphalfs(const vec<T, N>& x)
+{
+    return x.shuffle(csizeseq<N * 2> % csize<N>);
+}
+KFR_FN(duphalfs)
+
+/// @brief Shuffles two vectors using Indices as a pattern that repeats every `count` elements;
+/// within each repetition an index < count selects relative to that repetition's base offset.
+template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
+KFR_INTRINSIC vec<T, N> shuffle(const vec<T, N>& x, const vec<T, N>& y,
+                                elements_t<Indices...> i = elements_t<Indices...>())
+{
+    return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] +
+                            csizeseq_t<N>() / csize_t<count>() * csize_t<count>());
+}
+KFR_FN(shuffle)
+
+/// @brief Same as shuffle(), but Indices address groups of `group` adjacent elements
+/// instead of single elements.
+template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
+KFR_INTRINSIC vec<T, N> shufflegroups(const vec<T, N>& x, const vec<T, N>& y,
+                                      elements_t<Indices...> i = elements_t<Indices...>())
+{
+    return x.shuffle(y, scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] +
+                                     csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>()));
+}
+KFR_FN(shufflegroups)
+
+/// @brief Permutes a single vector using Indices as a repeating pattern (see shuffle()).
+template <size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
+KFR_INTRINSIC vec<T, N> permute(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>())
+{
+    return x.shuffle(i[csizeseq_t<N>() % csize_t<count>()] +
+                     csizeseq_t<N>() / csize_t<count>() * csize_t<count>());
+}
+KFR_FN(permute)
+
+/// @brief Same as permute(), but Indices address groups of `group` adjacent elements.
+template <size_t group, size_t... Indices, typename T, size_t N, size_t count = sizeof...(Indices)>
+KFR_INTRINSIC vec<T, N> permutegroups(const vec<T, N>& x, elements_t<Indices...> i = elements_t<Indices...>())
+{
+    return x.shuffle(scale<group>(i[csizeseq_t<N / group>() % csize_t<sizeof...(Indices)>()] +
+                                  csizeseq_t<N / group>() / csize_t<count>() * csize_t<count>()));
+}
+KFR_FN(permutegroups)
+
+namespace internal
+{
+
+// Helper: expands the index pack and builds the vector from Fn()(Indices)... values.
+template <typename T, size_t Nout, typename Fn, size_t... Indices>
+constexpr KFR_INTRINSIC vec<T, Nout> generate_vector(csizes_t<Indices...>)
+{
+    return make_vector(static_cast<T>(Fn()(Indices))...);
+}
+} // namespace internal
+
+/// @brief Creates a vector of Nout elements where element i equals Fn()(i); usable in
+/// constant expressions since Fn is default-constructed and invoked at compile time.
+template <typename T, size_t Nout, typename Fn>
+constexpr KFR_INTRINSIC vec<T, Nout> generate_vector()
+{
+    return internal::generate_vector<T, Nout, Fn>(cvalseq_t<size_t, Nout>());
+}
+KFR_FN(generate_vector)
+
+namespace internal
+{
+// Mask whose even elements (0, 2, ...) have all bits set and odd elements are zero.
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> evenmask()
+{
+    return broadcast<N>(maskbits<T>(true), maskbits<T>(false));
+}
+// Mask whose odd elements (1, 3, ...) have all bits set and even elements are zero.
+template <typename T, size_t N>
+KFR_INTRINSIC mask<T, N> oddmask()
+{
+    return broadcast<N>(maskbits<T>(false), maskbits<T>(true));
+}
+} // namespace internal
+
+/// @brief Duplicates each element in place, doubling the size: {a, b} -> {a, a, b, b}.
+template <typename T, size_t N, size_t Nout = N * 2>
+KFR_INTRINSIC vec<T, Nout> dup(const vec<T, N>& x)
+{
+    return x.shuffle(csizeseq_t<Nout>() / csize_t<2>());
+}
+KFR_FN(dup)
+
+/// @brief Repeats the low half of the vector over the whole vector: {a, b, c, d} -> {a, b, a, b}.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> duplow(const vec<T, N>& x)
+{
+    return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>());
+}
+KFR_FN(duplow)
+
+/// @brief Repeats the high half of the vector over the whole vector: {a, b, c, d} -> {c, d, c, d}.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> duphigh(const vec<T, N>& x)
+{
+    return x.shuffle(csizeseq_t<N>() % csize_t<N / 2>() + csize_t<N - N / 2>());
+}
+KFR_FN(duphigh)
+
+/// @brief Per-element select between x and y: Indices is a repeating 0/1 pattern where
+/// 0 takes the element from x and 1 takes it from y.
+template <size_t... Indices, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> blend(const vec<T, N>& x, const vec<T, N>& y,
+                              elements_t<Indices...> i = elements_t<Indices...>())
+{
+    return x.shuffle(y, i[csizeseq_t<N>() % csize_t<sizeof...(Indices)>()] * csize_t<N>() + csizeseq_t<N>());
+}
+KFR_FN(blend)
+
+/// @brief Reverses elements within each adjacent block of `elements` elements
+/// (default 2 swaps neighboring pairs).
+// NOTE(review): the XOR index trick assumes `elements` is a power of two — confirm callers.
+template <size_t elements = 2, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> swap(const vec<T, N>& x)
+{
+    return x.shuffle(csizeseq_t<N>() ^ csize_t<elements - 1>());
+}
+CMT_FN_TPL((size_t elements), (elements), swap)
+
+/// @brief Selects N consecutive elements spanning the hi/lo pair, offset by `shift`
+/// (shift == 0 yields lo, shift == N yields hi).
+template <size_t shift, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> rotatetwo(const vec<T, N>& lo, const vec<T, N>& hi)
+{
+    return shift == 0 ? lo : (shift == N ? hi : hi.shuffle(lo, csizeseq_t<N, N - shift>()));
+}
+
+/// @brief Rotates the vector right (towards higher indices) by `amount` elements.
+// NOTE(review): `amount` is size_t, so `amount >= 0` is always true; the effective check
+// is only `amount < N`.
+template <size_t amount, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> rotateright(const vec<T, N>& x, csize_t<amount> = csize_t<amount>())
+{
+    static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N");
+    return x.shuffle(csizeseq_t<N, N - amount>() % csize_t<N>());
+}
+KFR_FN(rotateright)
+
+/// @brief Rotates the vector left (towards lower indices) by `amount` elements.
+// NOTE(review): same tautological `amount >= 0` as in rotateright.
+template <size_t amount, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> rotateleft(const vec<T, N>& x, csize_t<amount> = csize_t<amount>())
+{
+    static_assert(amount >= 0 && amount < N, "amount >= 0 && amount < N");
+    return x.shuffle(csizeseq_t<N, amount>() % csize_t<N>());
+}
+KFR_FN(rotateleft)
+
+/// @brief Shifts y left by one element and inserts x as the new last element.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> insertright(T x, const vec<T, N>& y)
+{
+    return concat_and_slice<1, N>(y, vec<T, 1>(x));
+}
+KFR_FN(insertright)
+
+/// @brief Inserts x as the new first element of y, dropping y's last element.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> insertleft(T x, const vec<T, N>& y)
+{
+    return concat_and_slice<0, N>(vec<T, 1>(x), y);
+}
+KFR_FN(insertleft)
+
+/// @brief Treats the vector (in groups of `group` elements) as a side1 x side2 matrix
+/// and transposes it, where side2 is derived as size / side1.
+template <size_t side1, size_t group = 1, typename T, size_t N, size_t size = N / group,
+          size_t side2 = size / side1, KFR_ENABLE_IF(size > 3)>
+KFR_INTRINSIC vec<T, N> transpose(const vec<T, N>& x)
+{
+    return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
+                                  csizeseq_t<size>() / csize_t<side2>()));
+}
+// Trivial case: vectors of 3 or fewer groups are returned unchanged.
+template <size_t side, size_t group = 1, typename T, size_t N, KFR_ENABLE_IF(N / group <= 3)>
+KFR_INTRINSIC vec<T, N> transpose(const vec<T, N>& x)
+{
+    return x;
+}
+// Overload for a vector-of-vectors: flattens, transposes pairs, and rebuilds the nested type.
+template <typename T, size_t N>
+KFR_INTRINSIC vec<vec<T, N>, N> transpose(const vec<vec<T, N>, N>& x)
+{
+    return vec<vec<T, N>, N>::from_flatten(transpose<2>(x.flatten()));
+}
+KFR_FN(transpose)
+
+/// @brief Inverse of transpose(): here the template parameter fixes side2 and side1 is
+/// derived as size / side2, using the same index formula.
+template <size_t side2, size_t group = 1, typename T, size_t N, size_t size = N / group,
+          size_t side1 = size / side2, KFR_ENABLE_IF(size > 3)>
+KFR_INTRINSIC vec<T, N> transposeinverse(const vec<T, N>& x)
+{
+    return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
+                                  csizeseq_t<size>() / csize_t<side2>()));
+}
+// Trivial case: vectors of 3 or fewer groups are returned unchanged.
+template <size_t side, size_t groupsize = 1, typename T, size_t N, KFR_ENABLE_IF(N / groupsize <= 3)>
+KFR_INTRINSIC vec<T, N> transposeinverse(const vec<T, N>& x)
+{
+    return x;
+}
+KFR_FN(transposeinverse)
+
+/// @brief transpose() with a group size of 2 — presumably for complex (re, im) pairs; verify at call sites.
+template <size_t side, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> ctranspose(const vec<T, N>& x)
+{
+    return transpose<side, 2>(x);
+}
+KFR_FN(ctranspose)
+
+/// @brief transposeinverse() with a group size of 2 (see ctranspose).
+template <size_t side, typename T, size_t N>
+KFR_INTRINSIC vec<T, N> ctransposeinverse(const vec<T, N>& x)
+{
+    return transposeinverse<side, 2>(x);
+}
+KFR_FN(ctransposeinverse)
+
+/// @brief Interleaves two vectors element-wise (in groups of `group`):
+/// {x0, x1}, {y0, y1} -> {x0, y0, x1, y1}. Implemented as a 2-row transpose of the pair.
+template <size_t group = 1, typename T, size_t N, size_t Nout = N * 2, size_t size = Nout / group,
+          size_t side2 = 2, size_t side1 = size / side2>
+KFR_INTRINSIC vec<T, Nout> interleave(const vec<T, N>& x, const vec<T, N>& y)
+{
+    return x.shuffle(y, scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
+                                     csizeseq_t<size>() / csize_t<side2>()));
+}
+KFR_FN(interleave)
+
+/// @brief Interleaves the low and high halves of a single vector:
+/// {a, b, c, d} -> {a, c, b, d}.
+template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side2 = 2,
+          size_t side1 = size / side2>
+KFR_INTRINSIC vec<T, N> interleavehalfs(const vec<T, N>& x)
+{
+    return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
+                                  csizeseq_t<size>() / csize_t<side2>()));
+}
+KFR_FN(interleavehalfs)
+
+/// @brief De-interleaves adjacent pairs, gathering even elements into the low half and
+/// odd elements into the high half: {a, b, c, d} -> {a, c, b, d} inverse of interleavehalfs.
+template <size_t group = 1, typename T, size_t N, size_t size = N / group, size_t side1 = 2,
+          size_t side2 = size / side1>
+KFR_INTRINSIC vec<T, N> splitpairs(const vec<T, N>& x)
+{
+    return x.shuffle(scale<group>(csizeseq_t<size>() % csize_t<side2>() * csize_t<side1>() +
+                                  csizeseq_t<size>() / csize_t<side2>()));
+}
+KFR_FN(splitpairs)
+
+/// @brief Reverses the order of the vector's elements, taken in groups of `group`.
+template <size_t group = 1, typename T, size_t N, size_t size = N / group>
+KFR_INTRINSIC vec<T, N> reverse(const vec<T, N>& x)
+{
+    return x.shuffle(scale<group>(csizeseq_t<size, size - 1, -1>()));
+}
+// Overload for vector-of-vectors: reverses the outer elements while keeping each inner
+// vector's element order (implemented via swap on the flattened data).
+template <typename T, size_t N1, size_t N2>
+KFR_INTRINSIC vec<vec<T, N1>, N2> reverse(const vec<vec<T, N1>, N2>& x)
+{
+    return swap<N1>(x.flatten()).v;
+}
+KFR_FN(reverse)
+
+/// @brief Returns x with its first N2 elements replaced by the elements of y.
+template <typename T, size_t N1, size_t N2>
+KFR_INTRINSIC vec<T, N1> combine(const vec<T, N1>& x, const vec<T, N2>& y)
+{
+    static_assert(N2 <= N1, "N2 <= N1");
+    return x.shuffle(extend<N1>(y), (csizeseq_t<N1>() < csize_t<N2>()) * csize_t<N1>() + csizeseq_t<N1>());
+}
+KFR_FN(combine)
+
+namespace internal
+{
+// Functor: index -> start + index * stride (arithmetic sequence).
+template <size_t start, size_t stride>
+struct generate_index
+{
+    KFR_INTRINSIC constexpr size_t operator()(size_t index) const { return start + index * stride; }
+};
+// Functor: yields `on` for indices in [start, start + size), `off` elsewhere.
+template <size_t start, size_t size, int on, int off>
+struct generate_onoff
+{
+    KFR_INTRINSIC constexpr size_t operator()(size_t index) const
+    {
+        return index >= start && index < start + size ? on : off;
+    }
+};
+} // namespace internal
+
+/// @brief Returns the vector {start, start + stride, start + 2*stride, ...}.
+template <typename T, size_t N, size_t start = 0, size_t stride = 1>
+constexpr KFR_INTRINSIC vec<T, N> enumerate()
+{
+    return generate_vector<T, N, internal::generate_index<start, stride>>();
+}
+// Overload deducing T and N from a vec_shape tag argument.
+template <size_t start = 0, size_t stride = 1, typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> enumerate(vec_shape<T, N>)
+{
+    return generate_vector<T, N, internal::generate_index<start, stride>>();
+}
+KFR_FN(enumerate)
+
+/// @brief Returns a vector equal to `on` inside [start, start + size) and `off` elsewhere.
+template <typename T, size_t N, size_t start = 0, size_t size = 1, int on = 1, int off = 0>
+constexpr KFR_INTRINSIC vec<T, N> onoff(cint_t<on> = cint_t<on>(), cint_t<off> = cint_t<off>())
+{
+    return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>();
+}
+// Overload deducing T and N from a vec_shape tag argument.
+template <size_t start = 0, size_t size = 1, int on = 1, int off = 0, typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> onoff(vec_shape<T, N>, cint_t<on> = cint_t<on>(),
+                                        cint_t<off> = cint_t<off>())
+{
+    return generate_vector<T, N, internal::generate_onoff<start, size, on, off>>();
+}
+KFR_FN(onoff)
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
+#define KFR_SHUFFLE_SPECIALIZATIONS 1
+#include "impl/specializations.i"
diff --git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp
@@ -0,0 +1,372 @@
+/** @addtogroup types
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../kfr.h"
+
+#include "impl/intrinsics.h"
+
+#include <climits>
+
+#include <cmath>
+#include <limits>
+#include <random>
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers")
+
+#ifdef KFR_TESTING
+#include "../testo/testo.hpp"
+#endif
+
+#include "../cometa.hpp"
+#include "../cometa/numeric.hpp"
+
+namespace kfr
+{
+
+// Include all from CoMeta library
+using namespace cometa;
+
+using cometa::fbase;
+using cometa::fmax;
+
+// Project-local reimplementation of std::common_type. common_type_impl is the
+// customization point: other headers can specialize it so vector/compound types
+// participate in common-type deduction.
+// primary template (used for zero types)
+template <typename... T>
+struct common_type_impl
+{
+};
+
+// NOTE(review): this decays the common_type_impl struct itself, not its nested ::type —
+// looks unintended (compare std::common_type_t); verify how this alias is used.
+template <typename... T>
+using decay_common = decay<common_type_impl<T...>>;
+
+// SFINAE helper: exposes result_type<common_type_impl<T1, T2>::type> only when the
+// common type of the two subtypes exists.
+template <typename T1, typename T2, template <typename TT> class result_type, typename = void>
+struct common_type_from_subtypes
+{
+};
+
+template <typename T1, typename T2, template <typename TT> class result_type>
+struct common_type_from_subtypes<T1, T2, result_type, void_t<typename common_type_impl<T1, T2>::type>>
+{
+    using type = result_type<typename common_type_impl<T1, T2>::type>;
+};
+
+// Single type: the common type is the decayed type itself.
+template <typename T>
+struct common_type_impl<T>
+{
+    using type = decay<T>;
+};
+
+// Two types: deduced from the conditional operator, as in std::common_type.
+template <typename T1, typename T2>
+using common_for_two = decltype(false ? std::declval<T1>() : std::declval<T2>());
+
+// Empty unless the ternary expression above is well-formed for T1/T2.
+template <typename T1, typename T2, typename = void>
+struct common_type_2_default
+{
+};
+
+template <typename T1, typename T2>
+struct common_type_2_default<T1, T2, void_t<common_for_two<T1, T2>>>
+{
+    using type = std::decay_t<common_for_two<T1, T2>>;
+};
+
+// If T1/T2 are not already decayed, retry with the decayed types (so user
+// specializations of common_type_impl on decayed types are found).
+template <typename T1, typename T2, typename D1 = decay<T1>, typename D2 = decay<T2>>
+struct common_type_2_impl : common_type_impl<D1, D2>
+{
+};
+
+template <typename D1, typename D2>
+struct common_type_2_impl<D1, D2, D1, D2> : common_type_2_default<D1, D2>
+{
+};
+
+template <typename T1, typename T2>
+struct common_type_impl<T1, T2> : common_type_2_impl<T1, T2>
+{
+};
+
+// Three or more types: left-fold — common_type(T1, T2) combined with the rest,
+// propagating "no type" when any pairwise step fails.
+template <typename AlwaysVoid, typename T1, typename T2, typename... R>
+struct common_type_multi_impl
+{
+};
+
+template <typename T1, typename T2, typename... R>
+struct common_type_multi_impl<void_t<typename common_type_impl<T1, T2>::type>, T1, T2, R...>
+    : common_type_impl<typename common_type_impl<T1, T2>::type, R...>
+{
+};
+
+template <typename T1, typename T2, typename... R>
+struct common_type_impl<T1, T2, R...> : common_type_multi_impl<void, T1, T2, R...>
+{
+};
+
+/// @brief Common type of T..., honoring KFR's common_type_impl specializations.
+template <typename... T>
+using common_type = typename common_type_impl<T...>::type;
+
+// Compile-time type lists enumerating KFR's scalar element types; f64 entries are
+// included only when the platform supports native double (KFR_NATIVE_F64).
+constexpr ctypes_t<i8, i16, i32, i64> signed_types{};
+constexpr ctypes_t<u8, u16, u32, u64> unsigned_types{};
+constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64> integer_types{};
+constexpr ctypes_t<f32
+#ifdef KFR_NATIVE_F64
+                   ,
+                   f64
+#endif
+                   >
+    float_types{};
+constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64, f32
+#ifdef KFR_NATIVE_F64
+                   ,
+                   f64
+#endif
+                   >
+    numeric_types{};
+
+// Vector widths exercised by the test suite.
+constexpr csizes_t<1, 2, 3, 4, 8, 16, 32, 64> test_vector_sizes{};
+
+// Builds ctypes_t<vec_tpl<T, s>...> for the selected widths. Without KFR_EXTENDED_TESTS
+// only width 1 is instantiated; with it, all test widths fitting in 64 bytes are used.
+template <template <typename, size_t> class vec_tpl, typename T,
+          typename sizes =
+#ifdef KFR_EXTENDED_TESTS
+              cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<64 / sizeof(T)>)>
+#else
+              csizes_t<1>
+#endif
+          >
+struct vector_types_for_size_t_impl;
+
+template <template <typename, size_t> class vec_tpl, typename T, size_t... sizes>
+struct vector_types_for_size_t_impl<vec_tpl, T, csizes_t<sizes...>>
+{
+    using type = ctypes_t<vec_tpl<T, sizes>...>;
+};
+
+template <template <typename, size_t> class vec_tpl, typename T>
+using vector_types_for_size_t = typename vector_types_for_size_t_impl<vec_tpl, T>::type;
+
+// Aggregated vector-type lists per element category (signed / unsigned / integer /
+// float / numeric), mirroring the scalar lists above.
+template <template <typename, size_t> class vec_tpl>
+using signed_vector_types_t =
+    concat_lists<vector_types_for_size_t<vec_tpl, i8>, vector_types_for_size_t<vec_tpl, i16>,
+                 vector_types_for_size_t<vec_tpl, i32>, vector_types_for_size_t<vec_tpl, i64>>;
+
+template <template <typename, size_t> class vec_tpl>
+constexpr signed_vector_types_t<vec_tpl> signed_vector_types{};
+
+template <template <typename, size_t> class vec_tpl>
+using unsigned_vector_types_t =
+    concat_lists<vector_types_for_size_t<vec_tpl, u8>, vector_types_for_size_t<vec_tpl, u16>,
+                 vector_types_for_size_t<vec_tpl, u32>, vector_types_for_size_t<vec_tpl, u64>>;
+
+template <template <typename, size_t> class vec_tpl>
+constexpr unsigned_vector_types_t<vec_tpl> unsigned_vector_types{};
+
+template <template <typename, size_t> class vec_tpl>
+using integer_vector_types_t = concat_lists<signed_vector_types_t<vec_tpl>, unsigned_vector_types_t<vec_tpl>>;
+
+template <template <typename, size_t> class vec_tpl>
+constexpr integer_vector_types_t<vec_tpl> integer_vector_types{};
+
+template <template <typename, size_t> class vec_tpl>
+using float_vector_types_t = concat_lists<vector_types_for_size_t<vec_tpl, f32>
+#ifdef KFR_NATIVE_F64
+                                          ,
+                                          vector_types_for_size_t<vec_tpl, f64>
+#endif
+                                          >;
+
+template <template <typename, size_t> class vec_tpl>
+constexpr float_vector_types_t<vec_tpl> float_vector_types{};
+
+template <template <typename, size_t> class vec_tpl>
+constexpr concat_lists<integer_vector_types_t<vec_tpl>, float_vector_types_t<vec_tpl>> numeric_vector_types{};
+
+/// @brief Packed unsigned 24-bit integer: 3 raw bytes, no arithmetic defined here.
+struct u24
+{
+    u8 raw[3];
+};
+
+/// @brief Packed signed 24-bit integer stored as 3 little-endian bytes.
+struct i24
+{
+    u8 raw[3];
+
+    // Stores the low 24 bits of x in little-endian byte order; upper bits of x are discarded.
+    constexpr i24(i32 x) CMT_NOEXCEPT : raw{}
+    {
+        raw[0] = x & 0xFF;
+        raw[1] = (x >> 8) & 0xFF;
+        raw[2] = (x >> 16) & 0xFF;
+    }
+
+    // Reassembles the value with sign extension: shifting the top byte to bit 24..31 and
+    // arithmetic-right-shifting by 8 replicates bit 23 into bits 24..31.
+    // NOTE(review): (raw[2] << 24) for raw[2] >= 0x80 and >> on a negative int are
+    // implementation-defined pre-C++20; works on mainstream compilers — confirm if strict
+    // portability is required.
+    constexpr i32 as_int() const CMT_NOEXCEPT
+    {
+        return static_cast<i32>(raw[0]) | static_cast<i32>(raw[1] << 8) |
+               (static_cast<i32>(raw[2] << 24) >> 8);
+    }
+
+    operator int() const CMT_NOEXCEPT { return as_int(); }
+};
+
+/// @brief 16-bit floating point storage type; raw holds the bit pattern only (no arithmetic here).
+struct f16
+{
+    u16 raw;
+};
+
+/// @brief Selects the smallest unsigned integer type able to hold `bits` bits and wraps one value of it.
+template <size_t bits>
+struct bitmask
+{
+    using type = conditional<(bits > 32), uint64_t,
+                             conditional<(bits > 16), uint32_t, conditional<(bits > 8), uint16_t, uint8_t>>>;
+
+    bitmask(type val) : value(val) {}
+
+    type value;
+};
+
+/// @brief Single boolean mask element for element type T.
+template <typename T>
+struct maskbit
+{
+    bool value;
+};
+
+// Function-object aliases re-exported from the CoMeta library under a KFR-local
+// namespace, so generic code can name them uniformly.
+namespace fn_generic
+{
+///@copybrief cometa::pass_through
+using pass_through = cometa::fn_pass_through;
+
+///@copybrief cometa::noop
+using noop = cometa::fn_noop;
+
+///@copybrief cometa::get_first
+using get_first = cometa::fn_get_first;
+
+///@copybrief cometa::get_second
+using get_second = cometa::fn_get_second;
+
+///@copybrief cometa::get_third
+using get_third = cometa::fn_get_third;
+
+///@copybrief cometa::returns
+template <typename T>
+using returns = cometa::fn_returns<T>;
+} // namespace fn_generic
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wattributes")
+
+/// @brief Wrapper used for aligned (A == true) vs unaligned (A == false) memory access to T.
+/// The primary template keeps T's natural alignment.
+template <typename T, bool A>
+struct struct_with_alignment
+{
+    using pointer       = struct_with_alignment*;
+    using const_pointer = const struct_with_alignment*;
+    T value;
+    KFR_MEM_INTRINSIC void operator=(T value) { this->value = value; }
+};
+
+// MSVC has no packed attribute; its __unaligned pointer qualifier is used instead.
+#ifdef CMT_COMPILER_MSVC
+#define KFR_UNALIGNED_POINTER __unaligned
+#else
+#define KFR_UNALIGNED_POINTER
+#endif
+
+// Unaligned specialization: packed (alignment 1) under GNU attributes, and may_alias so
+// loads/stores through it do not violate strict aliasing.
+template <typename T>
+struct struct_with_alignment<T, false>
+{
+    using pointer       = KFR_UNALIGNED_POINTER struct_with_alignment*;
+    using const_pointer = KFR_UNALIGNED_POINTER const struct_with_alignment*;
+    T value;
+    KFR_MEM_INTRINSIC void operator=(T value) { this->value = value; }
+}
+#ifdef CMT_GNU_ATTRIBUTES
+__attribute__((__packed__, __may_alias__)) //
+#endif
+;
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+/// @brief Fills a value with zeros
+// Zeroes the full object representation (including any padding) via memset.
+template <typename T1>
+KFR_INTRINSIC void zeroize(T1& value)
+{
+    builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1));
+}
+
+/// @brief Used to determine the initial value for reduce functions
+// Tag type only; specializations/overloads elsewhere map it to the actual neutral element.
+template <typename T>
+struct initialvalue
+{
+};
+
+/// @brief True for the fundamental arithmetic types usable as SIMD vector elements.
+// Note: plain char, bool and long double are deliberately absent from this list.
+template <typename T>
+struct is_simd_type
+    : std::integral_constant<
+          bool, std::is_same<T, float>::value || std::is_same<T, double>::value ||
+                    std::is_same<T, signed char>::value || std::is_same<T, unsigned char>::value ||
+                    std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
+                    std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+                    std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
+                    std::is_same<T, long long>::value || std::is_same<T, unsigned long long>::value>
+{
+};
+
+/// @brief Empty tag type carrying a vector's element type T and width N at compile time.
+template <typename T, size_t N>
+struct vec_shape
+{
+    using value_type = T;
+    constexpr static size_t size() CMT_NOEXCEPT { return N; }
+    constexpr vec_shape() CMT_NOEXCEPT = default;
+
+    // For compound T (e.g. vec<vec<...>>), the underlying scalar type and total scalar count.
+    using scalar_type = subtype<T>;
+    constexpr static size_t scalar_size() CMT_NOEXCEPT { return N * compound_type_traits<T>::width; }
+};
+
+// Sentinel index value (size_t(-1)) used in shuffle index sequences.
+constexpr size_t index_undefined = static_cast<size_t>(-1);
+
+// Tag types/values for constructing all-zero-bit and all-one-bit vectors.
+struct czeros_t
+{
+};
+struct cones_t
+{
+};
+constexpr czeros_t czeros{};
+constexpr cones_t cones{};
+
+// Tag aliases/values selecting aligned vs unaligned memory access in read/write calls.
+using caligned_t   = cbool_t<true>;
+using cunaligned_t = cbool_t<false>;
+
+constexpr caligned_t caligned{};
+constexpr cunaligned_t cunaligned{};
+
+// KFR_I_CE expands to constexpr only when the intrinsics backend supports constant evaluation.
+#ifdef CMT_INTRINSICS_IS_CONSTEXPR
+#define KFR_I_CE constexpr
+#else
+#define KFR_I_CE
+#endif
+
+// Forces an lvalue-to-rvalue conversion so naming x does not odr-use it.
+#define avoid_odr_use(x) static_cast<decltype(x)>(x)
+
+} // namespace kfr
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp
@@ -0,0 +1,1283 @@
+/** @addtogroup types
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../version.hpp"
+#include "constants.hpp"
+#include "impl/backend.hpp"
+
+/**
+ * @brief Internal macro for functions
+ */
+// Defines fn::FN, a function object whose operator() forwards all arguments to ::kfr::FN.
+#define KFR_FN(FN)                                                                                         \
+    namespace fn                                                                                           \
+    {                                                                                                      \
+    struct FN                                                                                              \
+    {                                                                                                      \
+        template <typename... Args>                                                                        \
+        CMT_INLINE_MEMBER decltype(::kfr::FN(std::declval<Args>()...)) operator()(Args&&... args) const    \
+        {                                                                                                  \
+            return ::kfr::FN(std::forward<Args>(args)...);                                                 \
+        }                                                                                                  \
+    };                                                                                                     \
+    }
+
+/**
+ * @brief Internal macro for functions
+ */
+// Same as KFR_FN, but forwards to ::kfr::intrinsics::FN.
+#define KFR_I_FN(FN)                                                                                       \
+    namespace fn                                                                                           \
+    {                                                                                                      \
+    struct FN                                                                                              \
+    {                                                                                                      \
+        template <typename... Args>                                                                        \
+        CMT_INLINE_MEMBER decltype(::kfr::intrinsics::FN(std::declval<Args>()...)) operator()(             \
+            Args&&... args) const                                                                          \
+        {                                                                                                  \
+            return ::kfr::intrinsics::FN(std::forward<Args>(args)...);                                     \
+        }                                                                                                  \
+    };                                                                                                     \
+    }
+
+// Defines fn::FN forwarding to an arbitrary (fully qualified) callable FULLFN.
+#define KFR_I_FN_FULL(FN, FULLFN)                                                                          \
+    namespace fn                                                                                           \
+    {                                                                                                      \
+    struct FN                                                                                              \
+    {                                                                                                      \
+        template <typename... Args>                                                                        \
+        CMT_INLINE_MEMBER decltype(FULLFN(std::declval<Args>()...)) operator()(Args&&... args) const       \
+        {                                                                                                  \
+            return FULLFN(std::forward<Args>(args)...);                                                    \
+        }                                                                                                  \
+    };                                                                                                     \
+    }
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wc++98-compat-local-type-template-args")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpacked")
+
+CMT_PRAGMA_MSVC(warning(push))
+CMT_PRAGMA_MSVC(warning(disable : 4814))
+
+namespace kfr
+{
+
+inline namespace CMT_ARCH_NAME
+{
+
+/// @brief Fixed-size POD vector with a portable (non-SIMD) layout: a plain array of T
+/// aligned to the next power of two of its byte size. The SIMD vec type (below) provides
+/// conversions to and from this type.
+template <typename T, size_t N>
+struct alignas(next_poweroftwo(sizeof(T)) * next_poweroftwo(N)) portable_vec
+{
+    static constexpr vec_shape<T, N> shape() CMT_NOEXCEPT { return {}; }
+
+    static_assert(N > 0 && N <= 1024, "Invalid vector size");
+
+    static_assert(is_simd_type<T>::value || !compound_type_traits<T>::is_scalar, "Invalid vector type");
+
+    // type and size
+    using value_type = T;
+
+    constexpr static size_t size() CMT_NOEXCEPT { return N; }
+
+    T elem[N];
+};
+
+template <typename T, size_t N>
+struct vec;
+
+template <typename T, size_t N>
+struct mask;
+
+// Low/high halves view of a vector; the split point is prev_poweroftwo(N - 1), i.e. the
+// largest power of two not exceeding N - 1 (so an 8-wide vector splits 4/4, a 3-wide 2/1).
+template <typename T, size_t N>
+struct vec_halves
+{
+    vec<T, prev_poweroftwo(N - 1)> low;
+    vec<T, N - prev_poweroftwo(N - 1)> high;
+};
+
+// Base case: a single-element vector has just the scalar value, no halves.
+template <typename T>
+struct vec_halves<T, 1>
+{
+    T val;
+};
+
+namespace internal
+{
+
+// scalar to scalar
+// Primary conversion template: a plain convertible-to cast. Presumably specialized
+// elsewhere for vector/compound conversions (vec's converting constructor calls
+// conversion<vec<T, N>, vec<U, N>>::cast) — verify against the other headers.
+template <typename To, typename From>
+struct conversion
+{
+    static_assert(std::is_convertible<From, To>::value, "");
+
+    static To cast(const From& value) { return value; }
+};
+
+// Maps between a (possibly nested) compound value and its flat vec representation.
+// Base case: a scalar T round-trips through vec<T, 1>.
+template <typename T>
+struct compoundcast
+{
+    static vec<T, 1> to_flat(const T& x) { return vec<T, 1>(x); }
+    static T from_flat(const vec<T, 1>& x) { return x.front(); }
+};
+// A vec is already flat.
+template <typename T, size_t N>
+struct compoundcast<vec<T, N>>
+{
+    static const vec<T, N>& to_flat(const vec<T, N>& x) { return x; }
+    static const vec<T, N>& from_flat(const vec<T, N>& x) { return x; }
+};
+// One level of nesting: vec<vec<T, N1>, N2> flattens to vec<T, N1 * N2>.
+template <typename T, size_t N1, size_t N2>
+struct compoundcast<vec<vec<T, N1>, N2>>
+{
+    static vec<T, N1 * N2> to_flat(const vec<vec<T, N1>, N2>& x) { return x; }
+    static vec<vec<T, N1>, N2> from_flat(const vec<T, N1 * N2>& x) { return x; }
+};
+} // namespace internal
+
+template <typename T, size_t N>
+struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<T>::deep_subtype,
+ N * compound_type_traits<T>::deep_width>),
+ const_min(size_t(platform<>::native_vector_alignment),
+ next_poweroftwo(sizeof(typename compound_type_traits<T>::deep_subtype) *
+ N * compound_type_traits<T>::deep_width)))) vec
+{
+ static constexpr vec_shape<T, N> shape() CMT_NOEXCEPT { return {}; }
+
+ // type and size
+ using value_type = T;
+
+ constexpr static size_t size() CMT_NOEXCEPT { return N; }
+
+ using ST = typename compound_type_traits<T>::deep_subtype;
+ using scalar_type = ST;
+
+ enum : size_t
+ {
+ SW = compound_type_traits<T>::deep_width,
+ SN = N * SW
+ };
+
+ constexpr static size_t scalar_size() CMT_NOEXCEPT { return SN; }
+
+ static_assert(is_simd_type<scalar_type>::value, "Invalid vector type");
+
+ static_assert(scalar_size() > 0 && scalar_size() <= 1024, "Invalid vector size");
+
+ using mask_t = mask<T, N>;
+
+ using simd_type = intrinsics::simd<ST, SN>;
+ using uvalue_type = utype<T>;
+ using iuvalue_type = conditional<is_i_class<T>::value, T, uvalue_type>;
+
+ using uscalar_type = utype<ST>;
+ using iuscalar_type = conditional<is_i_class<ST>::value, ST, uscalar_type>;
+
+ using usimd_type = intrinsics::simd<uscalar_type, SN>;
+ using iusimd_type = intrinsics::simd<iuscalar_type, SN>;
+
+ // constructors and assignment
+ // from SIMD
+ KFR_MEM_INTRINSIC vec(const simd_type& simd) CMT_NOEXCEPT : v(simd) {}
+ // default
+ KFR_MEM_INTRINSIC constexpr vec() CMT_NOEXCEPT = default;
+ // copy
+ KFR_MEM_INTRINSIC constexpr vec(const vec& value) CMT_NOEXCEPT = default;
+ // move
+ KFR_MEM_INTRINSIC constexpr vec(vec&&) CMT_NOEXCEPT = default;
+ // assignment
+ KFR_MEM_INTRINSIC constexpr vec& operator=(const vec&) CMT_NOEXCEPT = default;
+
+ // from scalar
+ template <typename U,
+ KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT
+ : v(intrinsics::simd_broadcast(intrinsics::simd_t<ST, SN>{}, static_cast<ST>(s)))
+ {
+ }
+ template <typename U,
+ KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT
+ : v(intrinsics::simd_shuffle(intrinsics::simd_t<ST, SW>{},
+ internal::compoundcast<T>::to_flat(static_cast<T>(s)).v,
+ csizeseq<SN> % csize<SW>, overload_auto))
+ {
+ }
+
+ // from list
+ template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT
+ : v(intrinsics::simd_make(ctype<T>, s0, s1, static_cast<value_type>(rest)...))
+ {
+ }
+ template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT
+ : v(intrinsics::simd_concat<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>(
+ internal::compoundcast<T>::to_flat(s0).v, internal::compoundcast<T>::to_flat(s1).v,
+ internal::compoundcast<T>::to_flat(static_cast<T>(rest)).v...))
+ {
+ }
+
+ // from vector of another type
+ template <typename U,
+ KFR_ENABLE_IF(std::is_convertible<U, value_type>::value&& compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
+ : v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v))
+ {
+ }
+ template <typename U,
+ KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
+ : v(internal::conversion<vec<T, N>, vec<U, N>>::cast(x).v)
+ {
+ }
+
+ // from list of vectors
+ template <size_t... Ns, typename = enable_if<csum<size_t, Ns...>() == N>>
+ KFR_MEM_INTRINSIC vec(const vec<T, Ns>&... vs) CMT_NOEXCEPT
+ : v(intrinsics::simd_concat<ST, (SW * Ns)...>(vs.v...))
+ {
+ }
+
+ KFR_MEM_INTRINSIC vec(const portable_vec<T, N>& p) CMT_NOEXCEPT : vec(bitcast_anything<vec>(p)) {}
+
+ KFR_MEM_INTRINSIC operator portable_vec<T, N>() const CMT_NOEXCEPT
+ {
+ return bitcast_anything<portable_vec<T, N>>(*this);
+ }
+
+ KFR_MEM_INTRINSIC vec(czeros_t) CMT_NOEXCEPT : v(intrinsics::simd_zeros<ST, SN>()) {}
+
+ KFR_MEM_INTRINSIC vec(cones_t) CMT_NOEXCEPT : v(intrinsics::simd_allones<ST, SN>()) {}
+
+ template <typename U, size_t M, KFR_ENABLE_IF(sizeof(U) * M == sizeof(T) * N)>
+ KFR_MEM_INTRINSIC static vec frombits(const vec<U, M>& v) CMT_NOEXCEPT
+ {
+ return intrinsics::simd_bitcast(
+ intrinsics::simd_cvt_t<ST, typename vec<U, M>::scalar_type, vec<U, M>::scalar_size()>{}, v.v);
+ }
+
+ // shuffle
+ template <size_t... indices>
+ KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(csizes_t<indices...> i) const CMT_NOEXCEPT
+ {
+ return vec<value_type, sizeof...(indices)>(
+ intrinsics::simd_shuffle(intrinsics::simd_t<ST, SN>{}, v, scale<SW>(i), overload_auto));
+ }
+
+ template <size_t... indices>
+ KFR_MEM_INTRINSIC vec<value_type, sizeof...(indices)> shuffle(const vec& y,
+ csizes_t<indices...> i) const CMT_NOEXCEPT
+ {
+ return vec<value_type, sizeof...(indices)>(
+ intrinsics::simd_shuffle(intrinsics::simd2_t<ST, SN, SN>{}, v, y.v, scale<SW>(i), overload_auto));
+ }
+
+ // element access
+ struct element;
+
+ KFR_MEM_INTRINSIC constexpr value_type operator[](size_t index) const& CMT_NOEXCEPT { return get(index); }
+
+ KFR_MEM_INTRINSIC constexpr value_type operator[](size_t index) && CMT_NOEXCEPT { return get(index); }
+
+ KFR_MEM_INTRINSIC constexpr element operator[](size_t index) & CMT_NOEXCEPT { return { *this, index }; }
+
+ KFR_MEM_INTRINSIC value_type front() const CMT_NOEXCEPT { return get(csize<0>); }
+
+ KFR_MEM_INTRINSIC value_type back() const CMT_NOEXCEPT { return get(csize<N - 1>); }
+
+ template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT
+ {
+ return intrinsics::simd_get_element<T, N>(v, index);
+ }
+ template <int dummy = 0, typename = void,
+ KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT
+ {
+ return this->s[index];
+ }
+
+ template <size_t index, KFR_ENABLE_IF(index < 1024 && compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT
+ {
+ return intrinsics::simd_get_element<T, N>(v, csize<index>);
+ }
+ template <size_t index, typename = void,
+ KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT
+ {
+ return internal::compoundcast<T>::from_flat(intrinsics::simd_shuffle(
+ intrinsics::simd_t<ST, SN>{}, v, csizeseq<SW, SW * index>, overload_auto));
+ }
+
+ template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT
+ {
+ v = intrinsics::simd_set_element<T, N>(v, index, s);
+ }
+ template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT
+ {
+ this->s[index] = s;
+ }
+
+ template <size_t index, KFR_ENABLE_IF(index < 1024 && compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT
+ {
+ v = intrinsics::simd_set_element<T, N>(v, csize<index>, s);
+ }
+ template <size_t index, typename = void,
+ KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)>
+ KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT
+ {
+ this->s[index] = s;
+ }
+
+ struct element
+ {
+ constexpr operator value_type() const CMT_NOEXCEPT { return v.get(index); }
+
+ KFR_MEM_INTRINSIC element& operator=(const value_type& s) CMT_NOEXCEPT
+ {
+ v.set(index, s);
+ return *this;
+ }
+
+ KFR_MEM_INTRINSIC element& operator=(const element& s) CMT_NOEXCEPT
+ {
+ v.set(index, static_cast<value_type>(s));
+ return *this;
+ }
+
+ template <typename U, size_t M>
+ KFR_MEM_INTRINSIC element& operator=(const typename vec<U, M>::element& s) CMT_NOEXCEPT
+ {
+ v.set(index, static_cast<value_type>(static_cast<U>(s)));
+ return *this;
+ }
+
+ vec& v;
+ size_t index;
+ };
+
+ // read/write
+ template <bool aligned = false>
+ KFR_MEM_INTRINSIC explicit constexpr vec(const value_type* src,
+ cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT
+ : v(intrinsics::simd_read<SN, aligned>(ptr_cast<ST>(src)))
+ {
+ }
+
+ template <bool aligned = false>
+ KFR_MEM_INTRINSIC const vec& write(value_type* dest,
+ cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT
+ {
+ intrinsics::simd_write<aligned, SN>(ptr_cast<ST>(dest), v);
+ return *this;
+ }
+
+ KFR_MEM_INTRINSIC vec<ST, SN> flatten() const CMT_NOEXCEPT { return v; }
+ KFR_MEM_INTRINSIC static vec from_flatten(const vec<ST, SN>& x) { return vec(x.v); }
+
+ KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); }
+
+ constexpr static size_t simd_element_size = const_min(vector_width<T>, N);
+ constexpr static size_t simd_element_count = N / simd_element_size;
+ using simd_element_type = simd<ST, simd_element_size>;
+
+public:
+ union {
+ simd_type v;
+ vec_halves<T, N> h;
+ simd_element_type w[simd_element_count];
+ T s[N];
+ };
+};
+
+template <typename T, size_t N, size_t... indices>
+KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevector(const vec<T, N>& x,
+ csizes_t<indices...> i) CMT_NOEXCEPT
+{
+ return intrinsics::simd_shuffle(intrinsics::simd_t<T, N>{}, x.v, i, overload_auto);
+}
+
+template <typename T, size_t N, size_t... indices>
+KFR_INTRINSIC vec<T, sizeof...(indices)> shufflevectors(const vec<T, N>& x, const vec<T, N>& y,
+ csizes_t<indices...> i) CMT_NOEXCEPT
+{
+ return intrinsics::simd_shuffle(intrinsics::simd2_t<T, N, N>{}, x.v, y.v, i, overload_auto);
+}
+
+namespace internal
+{
+
+#if 0
+constexpr inline size_t scale_get_index(size_t counter, size_t groupsize, size_t index) CMT_NOEXCEPT
+{
+ return index == index_undefined ? index_undefined : (counter % groupsize + groupsize * index);
+}
+
+#ifdef CMT_COMPILER_MSVC
+template <size_t counter, size_t groupsize, size_t... indices>
+constexpr inline size_t scale_get_index(csizes_t<indices...>) CMT_NOEXCEPT
+{
+ return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>()));
+}
+
+template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)>
+constexpr inline auto scale_impl(csizes_t<indices...> ind, csizes_t<counter...> cnt) CMT_NOEXCEPT
+ -> csizes_t<scale_get_index<counter, groupsize>(ind)...>
+{
+ return {};
+}
+#else
+
+template <size_t counter, size_t groupsize, size_t... indices>
+constexpr inline size_t scale_get_index() CMT_NOEXCEPT
+{
+ return scale_get_index(counter, groupsize, csizes_t<indices...>().get(csize_t<counter / groupsize>()));
+}
+
+template <size_t... indices, size_t... counter, size_t groupsize = sizeof...(counter) / sizeof...(indices)>
+constexpr inline auto scale_impl(csizes_t<indices...>, csizes_t<counter...>) CMT_NOEXCEPT
+ -> csizes_t<scale_get_index<counter, groupsize, indices...>()...>
+{
+ return {};
+}
+
+#endif
+#endif
+
+} // namespace internal
+
+template <size_t groupsize, size_t... indices>
+constexpr inline auto scale() CMT_NOEXCEPT
+{
+ return cconcat(csizeseq<groupsize, groupsize * indices>...);
+ // return internal::scale_impl(csizes_t<indices...>(), csizeseq<sizeof...(indices) * groupsize>);
+}
+
+namespace internal
+{
+template <typename T>
+struct is_vec_impl : std::false_type
+{
+};
+
+template <typename T, size_t N>
+struct is_vec_impl<vec<T, N>> : std::true_type
+{
+};
+} // namespace internal
+
+template <typename T>
+using is_vec = internal::is_vec_impl<T>;
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wold-style-cast")
+
+template <size_t N, typename T>
+constexpr KFR_INTRINSIC vec<T, N> broadcast(T x)
+{
+ return x;
+}
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+namespace internal
+{
+
+template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To>,
+ size_t Nout = (N * compound_type_traits<To>::deep_width)>
+constexpr KFR_INTRINSIC vec<To, N> builtin_convertvector(const vec<From, N>& value) CMT_NOEXCEPT
+{
+ return vec<To, N>(value);
+}
+
+// vector to vector
+template <typename To, typename From, size_t N, size_t N2>
+struct conversion<vec<To, N>, vec<From, N2>>
+{
+ static_assert(N == N2, "");
+ static_assert(!is_compound<To>::value, "");
+ static_assert(!is_compound<From>::value, "");
+
+ static vec<To, N> cast(const vec<From, N>& value) { return vec<To, N>(value); }
+};
+
+// scalar to vector
+template <typename To, typename From, size_t N>
+struct conversion<vec<To, N>, From>
+{
+ static_assert(std::is_convertible<From, To>::value, "");
+
+ static vec<To, N> cast(const From& value) { return broadcast<N>(static_cast<To>(value)); }
+};
+} // namespace internal
+
+template <typename T>
+constexpr size_t size_of() CMT_NOEXCEPT
+{
+ return sizeof(deep_subtype<T>) * compound_type_traits<T>::deep_width;
+}
+
+template <typename From, size_t N, typename Tsub = deep_subtype<From>,
+ size_t Nout = N* size_of<From>() / size_of<Tsub>()>
+constexpr KFR_INTRINSIC vec<Tsub, Nout> flatten(const vec<From, N>& x) CMT_NOEXCEPT
+{
+ return x.flatten();
+}
+
+template <typename To, typename From,
+ typename Tout = typename compound_type_traits<From>::template deep_rebind<To>>
+constexpr KFR_INTRINSIC Tout cast(const From& value) CMT_NOEXCEPT
+{
+ return static_cast<Tout>(value);
+}
+
+template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC vec<Tout, N> cast(const vec<Tin, N>& value) CMT_NOEXCEPT
+{
+ return vec<Tout, N>(value);
+}
+
+template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC vec<vec<Tout, N1>, N2> cast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT
+{
+ return vec<vec<Tout, N1>, N2>(value);
+}
+
+template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC const vec<Tin, N>& cast(const vec<Tin, N>& value) CMT_NOEXCEPT
+{
+ return value;
+}
+
+template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC const vec<vec<Tin, N1>, N2>& cast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT
+{
+ return value;
+}
+
+//
+
+template <typename To, typename From,
+ typename Tout = typename compound_type_traits<From>::template deep_rebind<To>>
+constexpr KFR_INTRINSIC Tout innercast(const From& value) CMT_NOEXCEPT
+{
+ return static_cast<Tout>(value);
+}
+
+template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC vec<Tout, N> innercast(const vec<Tin, N>& value) CMT_NOEXCEPT
+{
+ return vec<Tout, N>(value);
+}
+
+template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC vec<vec<Tout, N1>, N2> innercast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT
+{
+ return vec<vec<Tout, N1>, N2>(value);
+}
+
+template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC const vec<Tin, N>& innercast(const vec<Tin, N>& value) CMT_NOEXCEPT
+{
+ return value;
+}
+
+template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC const vec<vec<Tin, N1>, N2>& innercast(const vec<vec<Tin, N1>, N2>& value)
+ CMT_NOEXCEPT
+{
+ return value;
+}
+
+//
+
+template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC vec<Tout, N> elemcast(const vec<Tin, N>& value) CMT_NOEXCEPT
+{
+ return vec<Tout, N>(value);
+}
+
+template <typename Tout, typename Tin, size_t N, KFR_ENABLE_IF(is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC const vec<Tin, N>& elemcast(const vec<Tin, N>& value) CMT_NOEXCEPT
+{
+ return value;
+}
+
+template <typename Tout, typename Tin, size_t N1, size_t N2, KFR_ENABLE_IF(!is_same<Tin, Tout>::value)>
+constexpr KFR_INTRINSIC vec<Tout, N2> elemcast(const vec<vec<Tin, N1>, N2>& value) CMT_NOEXCEPT
+{
+ return vec<Tout, N2>(value);
+}
+
+template <typename To, typename From>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC To bitcast(const From& value) CMT_NOEXCEPT
+{
+ static_assert(sizeof(From) == sizeof(To), "bitcast: Incompatible types");
+ union {
+ From from;
+ To to;
+ } u{ value };
+ return u.to;
+}
+
+template <typename To, typename From, size_t N, size_t Nout = (N * size_of<From>() / size_of<To>())>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<To, Nout> bitcast(const vec<From, N>& value) CMT_NOEXCEPT
+{
+ return vec<To, Nout>::frombits(value);
+}
+
+template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INTRINSIC To ubitcast(const From& value) CMT_NOEXCEPT
+{
+ return bitcast<To>(value);
+}
+
+template <typename From, typename To = itype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INTRINSIC To ibitcast(const From& value) CMT_NOEXCEPT
+{
+ return bitcast<To>(value);
+}
+
+template <typename From, typename To = ftype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INTRINSIC To fbitcast(const From& value) CMT_NOEXCEPT
+{
+ return bitcast<To>(value);
+}
+
+template <typename From, typename To = uitype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
+constexpr KFR_INTRINSIC To uibitcast(const From& value) CMT_NOEXCEPT
+{
+ return bitcast<To>(value);
+}
+
+template <typename From, size_t N, typename To = utype<From>,
+ size_t Nout = size_of<From>() * N / size_of<To>()>
+constexpr KFR_INTRINSIC vec<To, Nout> ubitcast(const vec<From, N>& value) CMT_NOEXCEPT
+{
+ return vec<To, Nout>::frombits(value);
+}
+
+template <typename From, size_t N, typename To = itype<From>,
+ size_t Nout = size_of<From>() * N / size_of<To>()>
+constexpr KFR_INTRINSIC vec<To, Nout> ibitcast(const vec<From, N>& value) CMT_NOEXCEPT
+{
+ return vec<To, Nout>::frombits(value);
+}
+
+template <typename From, size_t N, typename To = ftype<From>,
+ size_t Nout = size_of<From>() * N / size_of<To>()>
+constexpr KFR_INTRINSIC vec<To, Nout> fbitcast(const vec<From, N>& value) CMT_NOEXCEPT
+{
+ return vec<To, Nout>::frombits(value);
+}
+
+template <typename From, size_t N, typename To = uitype<From>,
+ size_t Nout = size_of<From>() * N / size_of<To>()>
+constexpr KFR_INTRINSIC vec<To, Nout> uibitcast(const vec<From, N>& value) CMT_NOEXCEPT
+{
+ return vec<To, Nout>::frombits(value);
+}
+
+constexpr KFR_INTRINSIC size_t vector_alignment(size_t size) { return next_poweroftwo(size); }
+
+template <typename T, size_t N>
+struct pkd_vec
+{
+ constexpr pkd_vec() CMT_NOEXCEPT {}
+
+ pkd_vec(const vec<T, N>& value) CMT_NOEXCEPT { value.write(v); }
+
+ template <typename... Ts>
+ constexpr pkd_vec(Ts... init) CMT_NOEXCEPT : v{ static_cast<T>(init)... }
+ {
+ static_assert(N <= sizeof...(Ts), "Too few initializers for pkd_vec");
+ }
+
+private:
+ T v[N];
+ friend struct vec<T, N>;
+}
+#ifdef CMT_GNU_ATTRIBUTES
+__attribute__((packed))
+#endif
+;
+
+namespace internal
+{
+
+template <size_t, typename T>
+constexpr KFR_INTRINSIC T make_vector_get_n()
+{
+ return T();
+}
+
+template <size_t index, typename T, typename... Args>
+constexpr KFR_INTRINSIC T make_vector_get_n(const T& arg, const Args&... args)
+{
+ return index == 0 ? arg : make_vector_get_n<index - 1, T>(args...);
+}
+
+template <typename T, typename... Args, size_t... indices, size_t N = sizeof...(Args)>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> make_vector_impl(csizes_t<indices...>, const Args&... args)
+{
+ static_assert(sizeof...(indices) == sizeof...(Args), "");
+ const T list[] = { static_cast<T>(args)... };
+ return vec<T, N>(list[indices]...);
+}
+} // namespace internal
+
+/// Create vector from scalar values
+/// @code
+/// CHECK( make_vector( 1, 2, 3, 4 ) == i32x4{1, 2, 3, 4} );
+/// @endcode
+template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1),
+ typename SubType = fix_type<conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>>
+constexpr KFR_INTRINSIC vec<SubType, N> make_vector(const Arg& x, const Args&... rest)
+{
+ // static_assert(! is_same<SubType, unsigned long long>::value, "!!!--1");
+ // static_assert(! is_same<fix_type<SubType>, unsigned long long>::value, "!!!--2");
+ return internal::make_vector_impl<SubType>(cvalseq_t<size_t, N>(), static_cast<SubType>(x),
+ static_cast<SubType>(rest)...);
+}
+
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> make_vector(const vec<T, N>& x)
+{
+ return x;
+}
+
+template <typename T, T... Values, size_t N = sizeof...(Values)>
+constexpr KFR_INTRINSIC vec<T, N> make_vector(cvals_t<T, Values...>)
+{
+ return make_vector<T>(Values...);
+}
+
+template <typename Type = void, typename Arg, typename... Args, size_t N = (sizeof...(Args) + 1),
+ typename SubType = fix_type<conditional<is_void<Type>::value, common_type<Arg, Args...>, Type>>,
+ KFR_ENABLE_IF(is_number<subtype<SubType>>::value)>
+constexpr KFR_INTRINSIC vec<SubType, N> pack(const Arg& x, const Args&... rest)
+{
+ return internal::make_vector_impl<SubType>(csizeseq<N>, static_cast<SubType>(x),
+ static_cast<SubType>(rest)...);
+}
+
+using f32x1 = vec<f32, 1>;
+using f32x2 = vec<f32, 2>;
+using f32x3 = vec<f32, 3>;
+using f32x4 = vec<f32, 4>;
+using f32x8 = vec<f32, 8>;
+using f32x16 = vec<f32, 16>;
+using f32x32 = vec<f32, 32>;
+using f32x64 = vec<f32, 64>;
+using f64x1 = vec<f64, 1>;
+using f64x2 = vec<f64, 2>;
+using f64x3 = vec<f64, 3>;
+using f64x4 = vec<f64, 4>;
+using f64x8 = vec<f64, 8>;
+using f64x16 = vec<f64, 16>;
+using f64x32 = vec<f64, 32>;
+using f64x64 = vec<f64, 64>;
+using i8x1 = vec<i8, 1>;
+using i8x2 = vec<i8, 2>;
+using i8x3 = vec<i8, 3>;
+using i8x4 = vec<i8, 4>;
+using i8x8 = vec<i8, 8>;
+using i8x16 = vec<i8, 16>;
+using i8x32 = vec<i8, 32>;
+using i8x64 = vec<i8, 64>;
+using i16x1 = vec<i16, 1>;
+using i16x2 = vec<i16, 2>;
+using i16x3 = vec<i16, 3>;
+using i16x4 = vec<i16, 4>;
+using i16x8 = vec<i16, 8>;
+using i16x16 = vec<i16, 16>;
+using i16x32 = vec<i16, 32>;
+using i16x64 = vec<i16, 64>;
+using i32x1 = vec<i32, 1>;
+using i32x2 = vec<i32, 2>;
+using i32x3 = vec<i32, 3>;
+using i32x4 = vec<i32, 4>;
+using i32x8 = vec<i32, 8>;
+using i32x16 = vec<i32, 16>;
+using i32x32 = vec<i32, 32>;
+using i32x64 = vec<i32, 64>;
+using i64x1 = vec<i64, 1>;
+using i64x2 = vec<i64, 2>;
+using i64x3 = vec<i64, 3>;
+using i64x4 = vec<i64, 4>;
+using i64x8 = vec<i64, 8>;
+using i64x16 = vec<i64, 16>;
+using i64x32 = vec<i64, 32>;
+using i64x64 = vec<i64, 64>;
+using u8x1 = vec<u8, 1>;
+using u8x2 = vec<u8, 2>;
+using u8x3 = vec<u8, 3>;
+using u8x4 = vec<u8, 4>;
+using u8x8 = vec<u8, 8>;
+using u8x16 = vec<u8, 16>;
+using u8x32 = vec<u8, 32>;
+using u8x64 = vec<u8, 64>;
+using u16x1 = vec<u16, 1>;
+using u16x2 = vec<u16, 2>;
+using u16x3 = vec<u16, 3>;
+using u16x4 = vec<u16, 4>;
+using u16x8 = vec<u16, 8>;
+using u16x16 = vec<u16, 16>;
+using u16x32 = vec<u16, 32>;
+using u16x64 = vec<u16, 64>;
+using u32x1 = vec<u32, 1>;
+using u32x2 = vec<u32, 2>;
+using u32x3 = vec<u32, 3>;
+using u32x4 = vec<u32, 4>;
+using u32x8 = vec<u32, 8>;
+using u32x16 = vec<u32, 16>;
+using u32x32 = vec<u32, 32>;
+using u32x64 = vec<u32, 64>;
+using u64x1 = vec<u64, 1>;
+using u64x2 = vec<u64, 2>;
+using u64x3 = vec<u64, 3>;
+using u64x4 = vec<u64, 4>;
+using u64x8 = vec<u64, 8>;
+using u64x16 = vec<u64, 16>;
+using u64x32 = vec<u64, 32>;
+using u64x64 = vec<u64, 64>;
+
+namespace glsl_names
+{
+using vec2 = f32x2;
+using vec3 = f32x3;
+using vec4 = f32x4;
+using dvec2 = f64x2;
+using dvec3 = f64x3;
+using dvec4 = f64x4;
+using ivec2 = i32x2;
+using ivec3 = i32x3;
+using ivec4 = i32x4;
+using uvec2 = u32x2;
+using uvec3 = u32x3;
+using uvec4 = u32x4;
+} // namespace glsl_names
+namespace opencl_names
+{
+using char2 = i8x2;
+using char3 = i8x3;
+using char4 = i8x4;
+using char8 = i8x8;
+using char16 = i8x16;
+using uchar2 = u8x2;
+using uchar3 = u8x3;
+using uchar4 = u8x4;
+using uchar8 = u8x8;
+using uchar16 = u8x16;
+
+using short2 = i16x2;
+using short3 = i16x3;
+using short4 = i16x4;
+using short8 = i16x8;
+using short16 = i16x16;
+using ushort2 = u16x2;
+using ushort3 = u16x3;
+using ushort4 = u16x4;
+using ushort8 = u16x8;
+using ushort16 = u16x16;
+
+using int2 = i32x2;
+using int3 = i32x3;
+using int4 = i32x4;
+using int8 = i32x8;
+using int16 = i32x16;
+using uint2 = u32x2;
+using uint3 = u32x3;
+using uint4 = u32x4;
+using uint8 = u32x8;
+using uint16 = u32x16;
+
+using long2 = i64x2;
+using long3 = i64x3;
+using long4 = i64x4;
+using long8 = i64x8;
+using long16 = i64x16;
+using ulong2 = u64x2;
+using ulong3 = u64x3;
+using ulong4 = u64x4;
+using ulong8 = u64x8;
+using ulong16 = u64x16;
+
+using float2 = f32x2;
+using float3 = f32x3;
+using float4 = f32x4;
+using float8 = f32x8;
+using float16 = f32x16;
+
+using double2 = f64x2;
+using double3 = f64x3;
+using double4 = f64x4;
+using double8 = f64x8;
+using double16 = f64x16;
+} // namespace opencl_names
+
+namespace internal
+{
+
+template <size_t Index, typename T, size_t N, typename Fn, typename... Args,
+ typename Tout = result_of<Fn(subtype<decay<Args>>...)>>
+constexpr KFR_INTRINSIC Tout applyfn_helper(Fn&& fn, Args&&... args)
+{
+ return fn(args[Index]...);
+}
+
+template <typename T, size_t N, typename Fn, typename... Args,
+ typename Tout = result_of<Fn(subtype<decay<Args>>...)>, size_t... Indices>
+constexpr KFR_INTRINSIC vec<Tout, N> apply_helper(Fn&& fn, csizes_t<Indices...>, Args&&... args)
+{
+ return make_vector(applyfn_helper<Indices, T, N>(std::forward<Fn>(fn), std::forward<Args>(args)...)...);
+}
+
+template <typename T, size_t N, typename Fn, size_t... Indices>
+constexpr KFR_INTRINSIC vec<T, N> apply0_helper(Fn&& fn, csizes_t<Indices...>)
+{
+ return make_vector(((void)Indices, void(), fn())...);
+}
+} // namespace internal
+
+template <typename T, size_t N, typename Fn, typename... Args,
+ typename Tout = result_of<Fn(T, subtype<decay<Args>>...)>>
+constexpr KFR_INTRINSIC vec<Tout, N> apply(Fn&& fn, const vec<T, N>& arg, Args&&... args)
+{
+ return internal::apply_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>, arg, std::forward<Args>(args)...);
+}
+
+template <typename T, typename Fn, typename... Args, typename Tout = result_of<Fn(T, decay<Args>...)>,
+ KFR_ENABLE_IF(is_same<T, subtype<T>>::value)>
+constexpr KFR_INTRINSIC Tout apply(Fn&& fn, const T& arg, Args&&... args)
+{
+ return fn(arg, args...);
+}
+
+template <size_t N, typename Fn, typename T = result_of<Fn()>>
+constexpr KFR_INTRINSIC vec<T, N> apply(Fn&& fn)
+{
+ return internal::apply0_helper<T, N>(std::forward<Fn>(fn), csizeseq<N>);
+}
+
+template <typename T, size_t N>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector()
+{
+ return vec<T, N>(czeros);
+}
+
+template <typename T, size_t N>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector(vec_shape<T, N>)
+{
+ return vec<T, N>(czeros);
+}
+
+template <typename T, size_t N>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> zerovector(vec<T, N>)
+{
+ return vec<T, N>(czeros);
+}
+
+template <typename T, size_t N>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector()
+{
+ return vec<T, N>(cones);
+}
+
+template <typename T, size_t N>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector(vec_shape<T, N>)
+{
+ return vec<T, N>(cones);
+}
+
+template <typename T, size_t N>
+CMT_GNU_CONSTEXPR KFR_INTRINSIC vec<T, N> allonesvector(vec<T, N>)
+{
+ return vec<T, N>(cones);
+}
+
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> undefinedvector()
+{
+ return vec<T, N>{};
+}
+
+template <typename T, size_t N>
+constexpr KFR_INTRINSIC vec<T, N> undefinedvector(vec_shape<T, N>)
+{
+ return undefinedvector<T, N>();
+}
+
+template <size_t N>
+struct vec_template
+{
+ template <typename T>
+ using type = vec<T, N>;
+};
+
+#ifdef KFR_TESTING
+
+inline const std::vector<special_value>& special_values()
+{
+ static const std::vector<special_value> values{ special_constant::infinity,
+ special_constant::neg_infinity,
+ special_constant::min,
+ special_constant::lowest,
+ special_constant::max,
+ 3.1415926535897932384626433832795,
+ 4.499999,
+ 4.500001,
+ -4.499999,
+ -4.500001,
+ 0.1111111111111111111111111111111,
+ -0.4444444444444444444444444444444,
+ -1,
+ 0,
+ +1 };
+ return values;
+}
+
+namespace test_catogories
+{
+constexpr cint_t<1> scalars{};
+constexpr cint_t<2> vectors{};
+constexpr cint_t<3> all{};
+
+constexpr inline auto types(cint_t<0>) { return ctypes_t<>{}; }
+constexpr inline auto types(cint_t<1>) { return cconcat(numeric_types); }
+constexpr inline auto types(cint_t<2>) { return cconcat(numeric_vector_types<vec>); }
+constexpr inline auto types(cint_t<3>) { return cconcat(numeric_types, numeric_vector_types<vec>); }
+
+} // namespace test_catogories
+
+template <typename T, size_t N, size_t... indices>
+vec<T, N> test_enumerate(vec_shape<T, N>, csizes_t<indices...>, double start = 0, double step = 1)
+{
+ return make_vector<T>(static_cast<T>(start + step * indices)...);
+}
+
+template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_return_constant<bool, true>>
+void test_function1(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{})
+{
+ testo::matrix(
+ named("type") = test_catogories::types(cat), named("value") = special_values(),
+ [&](auto type, special_value value) {
+ using T = type_of<decltype(type)>;
+ if (isapplicable(ctype<T>, value))
+ {
+ const T x(value);
+ CHECK(std::is_same<decltype(fn(x)), typename compound_type_traits<T>::template rebind<
+ decltype(reffn(std::declval<subtype<T>>()))>>::value);
+ CHECK(fn(x) == apply(reffn, x));
+ }
+ });
+
+ testo::matrix(named("type") = test_catogories::types(cint<Cat & ~1>), [&](auto type) {
+ using T = type_of<decltype(type)>;
+ const T x = test_enumerate(T::shape(), csizeseq<T::size()>, 0);
+ CHECK(fn(x) == apply(reffn, x));
+ });
+}
+
+template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_return_constant<bool, true>>
+void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{})
+{
+ testo::matrix(
+ named("type") = test_catogories::types(cat),
+ named("value1") = special_values(), //
+ named("value2") = special_values(), [&](auto type, special_value value1, special_value value2) {
+ using T = type_of<decltype(type)>;
+ const T x1(value1);
+ const T x2(value2);
+ if (isapplicable(ctype<T>, value1, value2))
+ {
+ CHECK(std::is_same<decltype(fn(x1, x2)),
+ typename compound_type_traits<T>::template rebind<decltype(reffn(
+ std::declval<subtype<T>>(), std::declval<subtype<T>>()))>>::value);
+ CHECK(fn(x1, x2) == apply(reffn, x1, x2));
+ }
+ });
+
+ testo::matrix(named("type") = test_catogories::types(cint<Cat & ~1>), [&](auto type) {
+ using T = type_of<decltype(type)>;
+ const T x1 = test_enumerate(T::shape(), csizeseq<T::size()>, 0, 1);
+ const T x2 = test_enumerate(T::shape(), csizeseq<T::size()>, 100, -1);
+ CHECK(fn(x1, x2) == apply(reffn, x1, x2));
+ });
+}
+
+#endif
+
+namespace internal
+{
+// vector<vector> to vector<vector>
+template <typename To, typename From, size_t N1, size_t N2, size_t Ns1>
+struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>>
+{
+ static_assert(N1 == Ns1, "");
+ static_assert(!is_compound<To>::value, "");
+ static_assert(!is_compound<From>::value, "");
+ static vec<vec<To, N1>, N2> cast(const vec<From, N1>& value)
+ {
+ return vec<vec<To, N1>, N2>::from_flatten(
+ kfr::innercast<To>(value.flatten())
+ .shuffle(csizeseq<N2 * vec<From, N1>::scalar_size()> % csize<N2>));
+ }
+};
+// vector<vector> to vector<vector>
+template <typename To, typename From, size_t N1, size_t N2, size_t NN1, size_t NN2>
+struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>>
+{
+ static_assert(N1 == NN1, "");
+ static_assert(N2 == NN2, "");
+ static_assert(!is_compound<To>::value, "");
+ static_assert(!is_compound<From>::value, "");
+ static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value)
+ {
+ return vec<vec<To, N1>, N2>::from_flatten(kfr::innercast<To>(value.flatten()));
+ }
+};
+} // namespace internal
+
+template <typename T, size_t N1, size_t N2 = N1>
+using mat = vec<vec<T, N1>, N2>;
+
+using u8x2x2 = vec<vec<u8, 2>, 2>;
+using i8x2x2 = vec<vec<i8, 2>, 2>;
+using u16x2x2 = vec<vec<u16, 2>, 2>;
+using i16x2x2 = vec<vec<i16, 2>, 2>;
+using u32x2x2 = vec<vec<u32, 2>, 2>;
+using i32x2x2 = vec<vec<i32, 2>, 2>;
+using u64x2x2 = vec<vec<u64, 2>, 2>;
+using i64x2x2 = vec<vec<i64, 2>, 2>;
+using f32x2x2 = vec<vec<f32, 2>, 2>;
+using f64x2x2 = vec<vec<f64, 2>, 2>;
+
+using u8x4x4 = vec<vec<u8, 4>, 4>;
+using i8x4x4 = vec<vec<i8, 4>, 4>;
+using u16x4x4 = vec<vec<u16, 4>, 4>;
+using i16x4x4 = vec<vec<i16, 4>, 4>;
+using u32x4x4 = vec<vec<u32, 4>, 4>;
+using i32x4x4 = vec<vec<i32, 4>, 4>;
+using u64x4x4 = vec<vec<u64, 4>, 4>;
+using i64x4x4 = vec<vec<i64, 4>, 4>;
+using f32x4x4 = vec<vec<f32, 4>, 4>;
+using f64x4x4 = vec<vec<f64, 4>, 4>;
+
+template <size_t N1, size_t N2>
+struct vec_vec_template
+{
+ template <typename T>
+ using type = vec<vec<T, N1>, N2>;
+};
+
+} // namespace CMT_ARCH_NAME
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::vec<T1, N>, kfr::vec<T2, N>>
+ : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type>
+{
+};
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<kfr::vec<T1, N>, T2>
+ : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type>
+{
+};
+template <typename T1, typename T2, size_t N>
+struct common_type_impl<T1, kfr::vec<T2, N>>
+ : common_type_from_subtypes<T1, T2, kfr::vec_template<N>::template type>
+{
+};
+
+template <typename T1, typename T2, size_t N1, size_t N2>
+struct common_type_impl<kfr::vec<T1, N1>, kfr::vec<kfr::vec<T2, N1>, N2>>
+ : common_type_from_subtypes<T1, T2, kfr::vec_vec_template<N1, N2>::template type>
+{
+ using type = kfr::vec<kfr::vec<typename common_type_impl<T1, T2>::type, N1>, N2>;
+};
+template <typename T1, typename T2, size_t N1, size_t N2>
+struct common_type_impl<kfr::vec<kfr::vec<T1, N1>, N2>, kfr::vec<T2, N1>>
+ : common_type_from_subtypes<T1, T2, kfr::vec_vec_template<N1, N2>::template type>
+{
+};
+} // namespace kfr
+
+namespace cometa
+{
+
+template <typename T, size_t N>
+struct compound_type_traits<kfr::vec_shape<T, N>>
+{
+ constexpr static size_t width = N;
+ constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+ using subtype = T;
+ using deep_subtype = cometa::deep_subtype<T>;
+ constexpr static bool is_scalar = false;
+ constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
+
+ template <typename U>
+ using rebind = kfr::vec_shape<U, N>;
+ template <typename U>
+ using deep_rebind = kfr::vec_shape<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
+};
+
+template <typename T, size_t N>
+struct compound_type_traits<kfr::vec<T, N>>
+{
+ using subtype = T;
+ using deep_subtype = cometa::deep_subtype<T>;
+ constexpr static size_t width = N;
+ constexpr static size_t deep_width = width * compound_type_traits<T>::width;
+ constexpr static bool is_scalar = false;
+ constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
+ template <typename U>
+ using rebind = kfr::vec<U, N>;
+ template <typename U>
+ using deep_rebind = kfr::vec<typename compound_type_traits<subtype>::template deep_rebind<U>, N>;
+
+ KFR_MEM_INTRINSIC static constexpr subtype at(const kfr::vec<T, N>& value, size_t index)
+ {
+ return value[index];
+ }
+};
+
+namespace details
+{
+template <typename T, size_t N>
+struct flt_type_impl<kfr::vec<T, N>>
+{
+ using type = kfr::vec<typename flt_type_impl<T>::type, N>;
+};
+} // namespace details
+} // namespace cometa
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+CMT_PRAGMA_MSVC(warning(pop))
diff --git a/include/kfr/testo/assert.hpp b/include/kfr/testo/assert.hpp
@@ -1,4 +1,7 @@
-#pragma once
+/** @addtogroup testo
+ * @{
+ */
+#pragma once
#include "comparison.hpp"
diff --git a/include/kfr/testo/comparison.hpp b/include/kfr/testo/comparison.hpp
@@ -1,4 +1,7 @@
-#pragma once
+/** @addtogroup testo
+ * @{
+ */
+#pragma once
#include "../cometa/tuple.hpp"
@@ -26,7 +29,7 @@ struct comparison
R right;
Fn cmp;
- comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)) {}
+ comparison(L&& left, R&& right) : left(std::forward<L>(left)), right(std::forward<R>(right)), cmp() {}
bool operator()() const { return cmp(left, right); }
};
@@ -53,28 +56,51 @@ CMT_PRAGMA_GNU(GCC diagnostic push)
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wfloat-equal")
template <typename T>
-inline T& epsilon()
+inline T& current_epsilon()
{
static T value = std::numeric_limits<T>::epsilon();
return value;
}
+template <typename T>
+struct eplison_scope
+{
+ eplison_scope(T scale) { current_epsilon<T>() = std::numeric_limits<T>::epsilon() * scale; }
+ ~eplison_scope() { current_epsilon<T>() = saved; }
+ T saved = current_epsilon<T>();
+};
+
+template <>
+struct eplison_scope<void>
+{
+ eplison_scope(float scale) : f(scale), d(scale), ld(scale) {}
+ eplison_scope<float> f;
+ eplison_scope<double> d;
+ eplison_scope<long double> ld;
+};
+
template <>
struct equality_comparer<float, float>
{
- bool operator()(const float& l, const float& r) const { return !(std::abs(l - r) > epsilon<float>()); }
+ bool operator()(const float& l, const float& r) const
+ {
+ return !(std::abs(l - r) > current_epsilon<float>());
+ }
};
template <>
struct equality_comparer<double, double>
{
- bool operator()(const double& l, const double& r) const { return !(std::abs(l - r) > epsilon<double>()); }
+ bool operator()(const double& l, const double& r) const
+ {
+ return !(std::abs(l - r) > current_epsilon<double>());
+ }
};
template <>
struct equality_comparer<long double, long double>
{
bool operator()(const long double& l, const long double& r) const
{
- return !(std::abs(l - r) > epsilon<long double>());
+ return !(std::abs(l - r) > current_epsilon<long double>());
}
};
diff --git a/include/kfr/testo/console_colors.hpp b/include/kfr/testo/console_colors.hpp
@@ -0,0 +1,166 @@
+#pragma once
+#include <cstdint>
+#include <cstdio>
+
+//#define CONSOLE_COLORS_FORCE_ASCII
+
+#if defined _WIN32 && !defined PRINT_COLORED_FORCE_ASCII
+#define USE_WIN32_API
+#endif
+
+#if defined(USE_WIN32_API)
+
+namespace win32_lite
+{
+typedef void* HANDLE;
+typedef uint32_t DWORD;
+
+#define WIN32_LITE_STD_INPUT_HANDLE ((win32_lite::DWORD)-10)
+#define WIN32_LITE_STD_OUTPUT_HANDLE ((win32_lite::DWORD)-11)
+#define WIN32_LITE_STD_ERROR_HANDLE ((win32_lite::DWORD)-12)
+
+#define WIN32_LITE_ENABLE_VIRTUAL_TERMINAL_PROCESSING (4)
+
+#define WIN32_LITE_DECLSPEC_IMPORT __declspec(dllimport)
+
+#define WIN32_LITE_WINAPI __stdcall
+
+typedef short SHORT;
+typedef unsigned short WORD;
+typedef int WINBOOL;
+
+extern "C"
+{
+ WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI GetConsoleMode(HANDLE hConsole, DWORD* dwMode);
+ WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleMode(HANDLE hConsole, DWORD dwMode);
+ WIN32_LITE_DECLSPEC_IMPORT HANDLE WIN32_LITE_WINAPI GetStdHandle(DWORD nStdHandle);
+ WIN32_LITE_DECLSPEC_IMPORT WINBOOL WIN32_LITE_WINAPI SetConsoleTextAttribute(HANDLE hConsoleOutput,
+ WORD wAttributes);
+}
+} // namespace win32_lite
+
+#endif
+
+namespace console_colors
+{
+
+enum text_color : uint32_t
+{
+ Black = 0x00,
+ DarkBlue = 0x01,
+ DarkGreen = 0x02,
+ DarkCyan = 0x03,
+ DarkRed = 0x04,
+ DarkMagenta = 0x05,
+ DarkYellow = 0x06,
+ LightGrey = 0x07,
+ Gray = 0x08,
+ Blue = 0x09,
+ Green = 0x0A,
+ Cyan = 0x0B,
+ Red = 0x0C,
+ Magenta = 0x0D,
+ Yellow = 0x0E,
+ White = 0x0F,
+ BgBlack = 0x00,
+ BgDarkBlue = 0x10,
+ BgDarkGreen = 0x20,
+ BgDarkCyan = 0x30,
+ BgDarkRed = 0x40,
+ BgDarkMagenta = 0x50,
+ BgDarkYellow = 0x60,
+ BgLightGrey = 0x70,
+ BgGray = 0x80,
+ BgBlue = 0x90,
+ BgGreen = 0xA0,
+ BgCyan = 0xB0,
+ BgRed = 0xC0,
+ BgMagenta = 0xD0,
+ BgYellow = 0xE0,
+ BgWhite = 0xF0,
+
+ Normal = BgBlack | LightGrey
+};
+
+enum console_buffer
+{
+ ConsoleStdOutput,
+ ConsoleStdError
+};
+
+struct console_color
+{
+public:
+ console_color(text_color c, console_buffer console = ConsoleStdOutput)
+ : m_old(get(console)), m_console(console)
+ {
+ set(c, m_console);
+ }
+
+ ~console_color() { set(m_old, m_console); }
+
+private:
+ text_color get(console_buffer = ConsoleStdOutput) { return saved_color(); }
+
+ void set(text_color new_color, console_buffer console = ConsoleStdOutput)
+ {
+#ifdef USE_WIN32_API
+ win32_lite::SetConsoleTextAttribute(win32_lite::GetStdHandle(console == ConsoleStdOutput
+ ? WIN32_LITE_STD_OUTPUT_HANDLE
+ : WIN32_LITE_STD_ERROR_HANDLE),
+ static_cast<win32_lite::WORD>(new_color));
+#else
+ if (new_color != Normal)
+ {
+ uint8_t t = new_color & 0xF;
+ uint8_t b = (new_color & 0xF0) >> 4;
+ uint8_t tnum = 30 + ((t & 1) << 2 | (t & 2) | (t & 4) >> 2);
+ uint8_t bnum = 40 + ((b & 1) << 2 | (b & 2) | (b & 4) >> 2);
+ if (t & 8)
+ tnum += 60;
+ if (b & 8)
+ bnum += 60;
+ std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[%d;%dm", tnum, bnum);
+ }
+ else
+ {
+ std::fprintf(console == ConsoleStdOutput ? stdout : stderr, "\x1B[0m");
+ }
+#endif
+ saved_color() = new_color;
+ }
+
+ text_color m_old;
+ console_buffer m_console;
+ static text_color& saved_color()
+ {
+ static text_color color = Normal;
+ return color;
+ }
+};
+
+template <text_color color, console_buffer console = ConsoleStdOutput>
+struct console_color_tpl : public console_color
+{
+public:
+ console_color_tpl() : console_color(color, console) {}
+
+private:
+};
+
+typedef console_color_tpl<DarkBlue> darkblue_text;
+typedef console_color_tpl<DarkGreen> darkgreen_text;
+typedef console_color_tpl<DarkCyan> darkcyan_text;
+typedef console_color_tpl<DarkRed> darkred_text;
+typedef console_color_tpl<DarkMagenta> darkmagenta_text;
+typedef console_color_tpl<DarkYellow> darkyellow_text;
+typedef console_color_tpl<LightGrey> lightgrey_text;
+typedef console_color_tpl<Gray> gray_text;
+typedef console_color_tpl<Blue> blue_text;
+typedef console_color_tpl<Green> green_text;
+typedef console_color_tpl<Cyan> cyan_text;
+typedef console_color_tpl<Red> red_text;
+typedef console_color_tpl<Magenta> magenta_text;
+typedef console_color_tpl<Yellow> yellow_text;
+typedef console_color_tpl<White> white_text;
+} // namespace console_colors
diff --git a/include/kfr/testo/double_double.hpp b/include/kfr/testo/double_double.hpp
@@ -0,0 +1,170 @@
+#pragma once
+
+#include <algorithm>
+#include <bitset>
+#include <cmath>
+#include <cstring>
+
+struct precise_fp
+{
+ int sign; // 1 means '+', -1 means '-', can't be 0
+ int exponent; // unbiased, INT_MIN means 0/denormal, INT_MAX means inf/nan
+ uint64_t mantissa; // with explicit first bit set, 63 significant bits
+
+ bool is_zero() const { return exponent == INT_MIN && mantissa == 0; }
+ bool is_denormal() const { return exponent == INT_MIN && mantissa != 0; }
+ bool is_inf() const { return exponent == INT_MAX && mantissa == 0; }
+ bool is_nan() const { return exponent == INT_MAX && mantissa != 0; }
+
+ double to_double() const { return sign * std::ldexp(static_cast<double>(mantissa), exponent); }
+ float to_float() const { return sign * std::ldexp(static_cast<float>(mantissa), exponent); }
+
+ precise_fp(int sign, int exponent, uint64_t mantissa) : sign(sign), exponent(exponent), mantissa(mantissa)
+ {
+ }
+
+ template <typename T>
+ explicit precise_fp(T value)
+ {
+ sign = static_cast<int>(std::copysign(T(1), value));
+ if (value == 0)
+ {
+ mantissa = 0;
+ exponent = INT_MIN;
+ }
+ else if (std::isinf(value))
+ {
+ mantissa = 0;
+ exponent = INT_MAX;
+ }
+ else if (std::isnan(value))
+ {
+ mantissa = 1;
+ exponent = INT_MAX;
+ }
+ else
+ {
+ mantissa = 0x80000000'00000000ull * std::frexp(value, &exponent);
+ }
+ }
+
+ friend double precise_ulps(const precise_fp& x, const float& y)
+ {
+ return precise_ulps(x, precise_fp(y), -126, 24);
+ }
+ friend double precise_ulps(const precise_fp& x, const double& y)
+ {
+ return precise_ulps(x, precise_fp(y), -1022, 53);
+ }
+
+ friend double precise_ulps(const precise_fp& x, const precise_fp& y, int minexponent, int mantissabits)
+ {
+ if (x.is_zero() && y.is_zero())
+ return 0;
+ if (x.is_nan() && y.is_nan())
+ return 0;
+ if (x.is_inf() && y.is_inf())
+ return x.sign == y.sign ? 0 : HUGE_VAL;
+ if (x.is_zero() && y.is_zero())
+ return 0;
+
+ if (x.sign != y.sign)
+ return HUGE_VAL;
+ uint64_t xx = x.mantissa;
+ uint64_t yy = y.mantissa;
+ const int minexp = std::min(x.exponent, y.exponent);
+ if (x.exponent - minexp <= 1 && y.exponent - minexp <= 1)
+ {
+ xx >>= y.exponent - minexp;
+ yy >>= x.exponent - minexp;
+ return static_cast<double>(xx > yy ? xx - yy : yy - xx) / (1 << (63 - mantissabits));
+ }
+ return HUGE_VAL;
+ }
+};
+
+struct double_double
+{
+ double hi, lo;
+
+ static_assert(sizeof(double) == 8, "");
+
+ constexpr double_double(double x) noexcept : hi(x), lo(0.0) {}
+ constexpr double_double(float x) noexcept : hi(x), lo(0.0) {}
+ constexpr double_double(double hi, double lo) noexcept : hi(hi + lo), lo((hi - (hi + lo)) + lo) {}
+ constexpr operator double() const noexcept { return hi + lo; }
+ constexpr operator float() const noexcept { return hi + lo; }
+
+ constexpr static double abs(double x) noexcept { return x >= 0 ? x : -x; }
+
+ constexpr friend double_double operator-(const double_double& x) noexcept { return { -x.hi, -x.lo }; }
+ constexpr friend double_double operator+(const double_double& x, const double_double& y) noexcept
+ {
+ const double sum = x.hi + y.hi;
+ return { sum, abs(x.hi) > abs(y.hi) ? (((x.hi - sum) + y.hi) + y.lo) + x.lo
+ : (((y.hi - sum) + x.hi) + x.lo) + y.lo };
+ }
+ constexpr friend double_double operator-(const double_double& x, const double_double& y) noexcept
+ {
+ const double diff = x.hi - y.hi;
+ return { diff, abs(x.hi) > abs(y.hi) ? (((x.hi - diff) - y.hi) - y.lo) + x.lo
+ : (((-y.hi - diff) + x.hi) + x.lo) - y.lo };
+ }
+ constexpr friend double_double operator*(const double_double& x, const double_double& y) noexcept
+ {
+ const double_double c = mul(x.hi, y.hi);
+ const double cc = (x.hi * y.lo + x.lo * y.hi) + c.lo;
+ return { c.hi, cc };
+ }
+ constexpr friend double_double operator/(const double_double& x, const double_double& y) noexcept
+ {
+ const double c = x.hi / y.hi;
+ const double_double u = mul(c, y.hi);
+ const double cc = ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) / y.hi;
+ return { c, cc };
+ }
+
+ bool isinf() const noexcept { return std::isinf(hi); }
+ bool isnan() const noexcept { return std::isnan(hi) || std::isnan(lo); }
+ bool iszero() const noexcept { return hi == 0 && lo == 0; }
+
+ double ulp(float value) const noexcept
+ {
+ if (std::isnan(value) && isnan())
+ return 0.0;
+ if (std::isinf(value) && isinf() && (std::copysign(1.0f, value) == std::copysign(1.0, hi)))
+ return 0.0;
+ if (value == 0 && iszero())
+ return 0.0;
+ if (std::nexttoward(value, 0.0) == 0.0 && iszero())
+ return 1.0;
+ return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0));
+ }
+ double ulp(double value) const noexcept
+ {
+ if (std::isnan(value) && isnan())
+ return 0.0;
+ if (std::isinf(value) && isinf() && (std::copysign(1.0, value) == std::copysign(1.0, hi)))
+ return 0.0;
+ if (value == 0 && iszero())
+ return 0.0;
+ if (std::nexttoward(value, 0.0) == 0.0 && iszero())
+ return 1.0;
+ return (double_double(value) - *this) / double_double(std::nexttoward(value, 0.0));
+ }
+
+private:
+ constexpr static double_double splitprec(double x) noexcept
+ {
+ const double p = x * 1.34217729e8;
+ const double h = (x - p) + p;
+ return { h, x - h };
+ }
+ constexpr static double_double mul(double x, double y) noexcept
+ {
+ const double_double xx = splitprec(x);
+ const double_double yy = splitprec(y);
+ const double z = x * y;
+ return { z, ((xx.hi * yy.hi - z) + xx.hi * yy.lo + xx.lo * yy.hi) + xx.lo * yy.lo };
+ }
+};
diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp
@@ -1,4 +1,7 @@
-#pragma once
+/** @addtogroup testo
+ * @{
+ */
+#pragma once
#include "comparison.hpp"
@@ -12,7 +15,8 @@
#include <mpfr/mpfr.hpp>
#include <mpfr/mpfr_tostring.hpp>
#endif
-#include "../ext/console_colors.hpp"
+#include "console_colors.hpp"
+#include <cassert>
#include <chrono>
#include <cmath>
@@ -21,6 +25,7 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpragmas")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wexit-time-destructors")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpadded")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses")
namespace testo
{
@@ -101,6 +106,15 @@ inline test_case*& active_test()
return instance;
}
+struct scope
+{
+ std::string text;
+ test_case* current_test;
+ scope* parent;
+ scope(std::string text);
+ ~scope();
+};
+
struct test_case
{
using test_func = void (*)();
@@ -155,12 +169,14 @@ struct test_case
}
console_color cc(White);
}
+ subtests.clear();
return !failed;
}
void check(bool result, const std::string& value, const char* expr)
{
- subtests.push_back(subtest{ result, as_string(padleft(22, expr), " | ", value), comment });
+ subtests.push_back(
+ subtest{ result, as_string(padleft(22, expr), " | ", value), current_scope_text() });
result ? success++ : failed++;
if (show_progress)
{
@@ -191,43 +207,59 @@ struct test_case
check(result, as_string(comparison.left), expr);
}
- void append_comment(const std::string& text)
+ struct subtest
+ {
+ bool success;
+ std::string text;
+ std::string comment;
+ };
+
+ void scope_changed()
{
- comment += text;
if (show_progress)
{
println();
- println(text, ":");
+ println(current_scope_text(), ":");
}
}
-
- void set_comment(const std::string& text)
+ std::string current_scope_text() const
{
- comment = text;
- if (show_progress)
+ scope* s = this->current_scope;
+ std::string result;
+ while (s)
{
- println();
- println(text, ":");
+ if (!result.empty())
+ result = "; " + result;
+ result = s->text + result;
+ s = s->parent;
}
+ return result;
}
- struct subtest
- {
- bool success;
- std::string text;
- std::string comment;
- };
-
test_func func;
const char* name;
std::vector<subtest> subtests;
- std::string comment;
int success;
int failed;
double time;
bool show_progress;
+ scope* current_scope = nullptr;
};
+inline scope::scope(std::string text)
+ : text(std::move(text)), current_test(active_test()), parent(current_test->current_scope)
+{
+ current_test->current_scope = this;
+ current_test->scope_changed();
+}
+
+inline scope::~scope()
+{
+ assert(active_test() == current_test);
+ assert(current_test->current_scope == this);
+ current_test->current_scope = parent;
+}
+
template <typename Number>
struct statistics
{
@@ -267,10 +299,10 @@ template <typename Arg0, typename Fn>
void matrix(named_arg<Arg0>&& arg0, Fn&& fn)
{
cforeach(std::forward<Arg0>(arg0.value), [&](auto v0) {
- active_test()->set_comment(as_string(arg0.name, " = ", v0));
+ scope s(as_string(arg0.name, " = ", v0));
fn(v0);
});
- if (active_test()->show_progress)
+ if (active_test() && active_test()->show_progress)
println();
}
@@ -278,7 +310,7 @@ template <typename Arg0, typename Arg1, typename Fn>
void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, Fn&& fn)
{
cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), [&](auto v0, auto v1) {
- active_test()->set_comment(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1));
+ scope s(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1));
fn(v0, v1);
});
if (active_test()->show_progress)
@@ -290,7 +322,7 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar
{
cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value),
[&](auto v0, auto v1, auto v2) {
- active_test()->set_comment(
+ scope s(
as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ", v2));
fn(v0, v1, v2);
});
@@ -298,27 +330,53 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar
println();
}
+template <typename Arg0, typename Arg1, typename Arg2, typename Arg3, typename Fn>
+void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& arg2, named_arg<Arg3>&& arg3,
+ Fn&& fn)
+{
+ cforeach(std::forward<Arg0>(arg0.value), std::forward<Arg1>(arg1.value), std::forward<Arg2>(arg2.value),
+ std::forward<Arg3>(arg3.value), [&](auto v0, auto v1, auto v2, auto v3) {
+ scope s(as_string(arg0.name, " = ", v0, ", ", arg1.name, " = ", v1, ", ", arg2.name, " = ",
+ v2, arg3.name, " = ", v3));
+ fn(v0, v1, v2, v3);
+ });
+ if (active_test()->show_progress)
+ println();
+}
+
CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show_successful = false)
{
std::vector<test_case*> success;
std::vector<test_case*> failed;
+ int success_checks = 0;
+ int failed_checks = 0;
for (test_case* t : test_case::tests())
{
if (name.empty() || t->name == name)
+ {
t->run(show_successful) ? success.push_back(t) : failed.push_back(t);
+ success_checks += t->success;
+ failed_checks += t->failed;
+ }
}
printfmt("{}\n", std::string(79, '='));
if (!success.empty())
{
console_color cc(Green);
printfmt("[{}]", padcenter(11, "SUCCESS", '-'));
- printfmt(" {} tests\n", success.size());
+ printfmt(" {}/{} tests {}/{} checks\n", success.size(), success.size() + failed.size(),
+ success_checks, success_checks + failed_checks);
}
if (!failed.empty())
{
console_color cc(Red);
printfmt("[{}]", padcenter(11, "ERROR", '-'));
- printfmt(" {} tests\n", failed.size());
+ printfmt(" {}/{} tests {}/{} checks\n", failed.size(), success.size() + failed.size(), failed_checks,
+ success_checks + failed_checks);
+ for (test_case* t : failed)
+ {
+ print(" ", t->name, "\n");
+ }
}
return static_cast<int>(failed.size());
}
@@ -334,6 +392,13 @@ void assert_is_same_decay()
static_assert(std::is_same<cometa::decay<T1>, cometa::decay<T2>>::value, "");
}
+template <typename T, size_t NArgs>
+struct test_data_entry
+{
+ T arguments[NArgs];
+ T result;
+};
+
#define TESTO_CHECK(...) \
do \
{ \
@@ -354,6 +419,7 @@ void assert_is_same_decay()
#define TEST TESTO_TEST
#define DTEST TESTO_DTEST
#endif
+
} // namespace testo
CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/include/kfr/version.hpp b/include/kfr/version.hpp
@@ -25,8 +25,7 @@
*/
#pragma once
-#include "base/types.hpp"
-#include "cpuid/cpuid_auto.hpp"
+#include "runtime/cpuid_auto.hpp"
namespace kfr
{
diff --git a/sources.cmake b/sources.cmake
@@ -7,99 +7,52 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/all.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/cpuid.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp.hpp
${PROJECT_SOURCE_DIR}/include/kfr/io.hpp
${PROJECT_SOURCE_DIR}/include/kfr/math.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/runtime.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd.hpp
${PROJECT_SOURCE_DIR}/include/kfr/version.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cident.h
- ${PROJECT_SOURCE_DIR}/include/kfr/base/abs.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/asin_acos.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/atan.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/kfr.h
${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/bitwise.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/clamp.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/comparison.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/compiletime.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/complex.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/constants.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/digitreverse.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/filter.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/fraction.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/function.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/gamma.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/base/function_expressions.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/generators.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/horizontal.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/hyperbolic.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/logical.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/log_exp.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/memory.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/min_max.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/modzerobessel.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/operators.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/platform.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/pointer.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/random.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/read_write.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/reduce.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/round.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/saturation.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/select.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/shuffle.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_clang.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_intrin.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/simd_x86.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/sin_cos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/sort.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/sqrt.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/tan.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/types.hpp
${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/vec.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/intrinsics.h
- ${PROJECT_SOURCE_DIR}/include/kfr/base/kfr.h
- ${PROJECT_SOURCE_DIR}/include/kfr/base/specializations.i
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/abs.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/asin_acos.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/atan.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/clamp.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/gamma.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/hyperbolic.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/logical.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/log_exp.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/min_max.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/modzerobessel.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/round.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/saturation.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/select.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/sin_cos.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/sqrt.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/tan.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/array.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/cstring.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/ctti.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/function.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/named_arg.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/cometa/numeric.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/range.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/result.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/string.hpp
${PROJECT_SOURCE_DIR}/include/kfr/cometa/tuple.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/cpuid/cpuid.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/cpuid/cpuid_auto.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/data/bitrev.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/data/sincos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/cache.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/convolution.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/fft.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/reference_dft.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/dft_c.h
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/bitrev.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/sincos.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/bitrev.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-fft.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-templates.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-templates.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/ft.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad_design.hpp
@@ -110,7 +63,6 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/dsp/fir_design.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/fracdelay.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/goertzel.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/dsp/interpolation.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/mixdown.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/oscillators.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/sample_rate_conversion.hpp
@@ -120,15 +72,114 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/dsp/waveshaper.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/weighting.hpp
${PROJECT_SOURCE_DIR}/include/kfr/dsp/window.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/ext/console_colors.hpp
- ${PROJECT_SOURCE_DIR}/include/kfr/ext/double_double.hpp
${PROJECT_SOURCE_DIR}/include/kfr/io/audiofile.hpp
${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp
${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp
${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/abs.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/clamp.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/complex_math.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/gamma.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/hyperbolic.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/interpolation.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/logical.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/log_exp.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/min_max.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/modzerobessel.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/round.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/saturation.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/select.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/sin_cos.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/sqrt.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/tan.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/abs.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/asin_acos.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/atan.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/clamp.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/gamma.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/hyperbolic.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/logical.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/log_exp.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/min_max.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/modzerobessel.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/round.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/saturation.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/select.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sin_cos.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/sqrt.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/math/impl/tan.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/runtime/cpuid_auto.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/comparison.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/complex.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/constants.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/digitreverse.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/horizontal.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/mask.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/operators.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/platform.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/read_write.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/shuffle.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/types.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/vec.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_clang.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/backend_generic.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_clang.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i
${PROJECT_SOURCE_DIR}/include/kfr/testo/assert.hpp
${PROJECT_SOURCE_DIR}/include/kfr/testo/comparison.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/testo/console_colors.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/testo/double_double.hpp
${PROJECT_SOURCE_DIR}/include/kfr/testo/testo.hpp
)
+
+
+set(
+ KFR_DFT_SRC
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-src.cpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f32.cpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f64.cpp
+)
+
+
+set(
+ KFR_IO_SRC
+ ${PROJECT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp
+)
+
+
+set(
+ KFR_UNITTEST_SRC
+ ${PROJECT_SOURCE_DIR}/tests/unit/base/conversion.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/base/reduce.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/abs.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/asin_acos.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/atan.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/hyperbolic.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/log_exp.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/min_max.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/round.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/select.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/sin_cos.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/math/tan.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/complex.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/operators.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/shuffle.cpp
+ ${PROJECT_SOURCE_DIR}/tests/unit/simd/vec.cpp
+)
+
+
+\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -15,12 +15,26 @@
# along with KFR.
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.1)
add_definitions(-DKFR_TESTING=1)
+add_definitions(-DKFR_SRC_DIR=\"${CMAKE_SOURCE_DIR}\")
+
+# Binary output directories
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/")
+if (ENABLE_ASMTEST)
+ add_executable(asm_test asm_test.cpp)
+ target_link_libraries(asm_test kfr)
+ target_set_arch(asm_test PRIVATE avx)
+ target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED)
+
+ add_custom_command(TARGET asm_test POST_BUILD COMMAND objconv -fyasm $<TARGET_FILE:asm_test>)
+endif()
+
if (NOT ARM)
if(MSVC AND NOT CLANG)
add_executable(multiarch multiarch.cpp multiarch_fir_sse2.cpp multiarch_fir_avx.cpp)
@@ -34,67 +48,96 @@ if (NOT ARM)
target_link_libraries(multiarch kfr)
endif ()
-find_package(MPFR)
-find_package(GMP)
-
set(ALL_TESTS_CPP
- all_tests.cpp
- base_test.cpp
- complex_test.cpp
- dsp_test.cpp
- expression_test.cpp
- intrinsic_test.cpp
- io_test.cpp
- resampler_test.cpp)
+ base_test.cpp
+ complex_test.cpp
+ dsp_test.cpp
+ expression_test.cpp
+ intrinsic_test.cpp
+ io_test.cpp
+ ${KFR_UNITTEST_SRC})
+
+# set(ALL_TESTS_MERGED_CPP all_tests_merged.cpp)
if (ENABLE_DFT)
list(APPEND ALL_TESTS_CPP dft_test.cpp)
endif ()
+find_package(MPFR)
+find_package(GMP)
+
if (MPFR_FOUND AND GMP_FOUND)
- list(APPEND ALL_TESTS_CPP transcendental_test.cpp)
-else ()
- message(STATUS "MPFR is not found. Skipping transcendental_test")
+ message(STATUS "MPFR is found")
+ add_executable(generate_data generate_data.cpp)
+ target_link_libraries(generate_data kfr)
+ target_include_directories(generate_data PRIVATE ${MPFR_INCLUDE_DIR} ${GMP_INCLUDE_DIR})
+ target_link_libraries(generate_data ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
+ if (REGENERATE_TESTS)
+ add_custom_command(TARGET generate_data POST_BUILD
+ COMMENT "Generating tests..."
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests/data
+ COMMAND generate_data)
+ endif ()
endif ()
-add_executable(all_tests ${ALL_TESTS_CPP})
+add_executable(all_tests all_tests.cpp ${ALL_TESTS_CPP})
target_compile_definitions(all_tests PRIVATE KFR_NO_MAIN)
+target_link_libraries(all_tests kfr use_arch)
if (ENABLE_DFT)
- target_link_libraries(all_tests kfr kfr_dft)
+ target_link_libraries(all_tests kfr_dft)
endif ()
-target_link_libraries(all_tests kfr kfr_io)
+target_link_libraries(all_tests kfr_io)
-if (MPFR_FOUND AND GMP_FOUND)
- add_definitions(-DHAVE_MPFR)
- include_directories(${MPFR_INCLUDE_DIR} ${GMP_INCLUDE_DIR})
- target_link_libraries(all_tests ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-endif ()
+function(add_x86_test ARCH)
+ set(NAME ${ARCH})
-function(add_x86_test NAME FLAGS)
- separate_arguments(FLAGS)
- add_executable(all_tests_${NAME} ${ALL_TESTS_CPP} ${KFR_IO_SRC})
+ add_executable(all_tests_${NAME} all_tests.cpp ${ALL_TESTS_CPP} ${KFR_IO_SRC})
if (ENABLE_DFT)
target_sources(all_tests_${NAME} PRIVATE ${KFR_DFT_SRC})
endif ()
- target_compile_options(all_tests_${NAME} PRIVATE ${FLAGS})
- target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN)
target_link_libraries(all_tests_${NAME} kfr)
+ target_set_arch(all_tests_${NAME} PRIVATE ${ARCH})
+ target_compile_definitions(all_tests_${NAME} PRIVATE KFR_NO_MAIN)
target_compile_definitions(all_tests_${NAME} PUBLIC KFR_ENABLE_FLAC=1)
- if (MPFR_FOUND AND GMP_FOUND)
- target_link_libraries(all_tests_${NAME} ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
+
+ if (ARCH_TESTS_MULTI)
+ add_library(all_tests_multiarch_${NAME} STATIC ${ALL_TESTS_MERGED_CPP} ${KFR_IO_SRC})
+ if (ENABLE_DFT)
+ target_sources(all_tests_multiarch_${NAME} PRIVATE ${KFR_DFT_SRC})
+ endif ()
+ target_link_libraries(all_tests_multiarch_${NAME} kfr)
+ target_set_arch(all_tests_multiarch_${NAME} PRIVATE ${ARCH})
+ target_compile_definitions(all_tests_multiarch_${NAME} PRIVATE KFR_NO_MAIN)
+ target_compile_definitions(all_tests_multiarch_${NAME} PUBLIC KFR_ENABLE_FLAC=1)
endif ()
+
endfunction()
if (ARCH_TESTS)
- set (ARCH_RESET "-march=x86-64 -mno-sse3 -mno-ssse3 -mno-sse4.1 -mno-sse4.2 -mno-avx -mno-avx2 -mno-fma -mno-avx512f -mno-avx512cd -mno-avx512bw -mno-avx512dq -mno-avx512vl")
- add_x86_test(generic "${ARCH_RESET} -DCMT_FORCE_GENERIC_CPU")
- add_x86_test(sse2 "${ARCH_RESET} -msse2")
- add_x86_test(sse3 "${ARCH_RESET} -msse3 -mno-avx")
- add_x86_test(ssse3 "${ARCH_RESET} -mssse3 -mno-avx")
- add_x86_test(sse41 "${ARCH_RESET} -msse4.1 -mno-avx")
- add_x86_test(avx "${ARCH_RESET} -msse4.1 -mavx")
- add_x86_test(avx2 "${ARCH_RESET} -msse4.1 -mavx2 -mfma")
- add_x86_test(avx512 "${ARCH_RESET} -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl")
+ if (NOT MSVC OR CLANG)
+ add_x86_test(generic)
+ endif ()
+ add_x86_test(sse2)
+ add_x86_test(sse3)
+ add_x86_test(ssse3)
+ add_x86_test(sse41)
+ add_x86_test(avx)
+ add_x86_test(avx2)
+ add_x86_test(avx512)
+
+ if (ARCH_TESTS_MULTI)
+ add_executable(all_tests_multiarch all_tests.cpp)
+ target_compile_definitions(all_tests_multiarch PRIVATE KFR_MULTI_ARCH)
+ target_link_libraries(all_tests_multiarch
+ all_tests_multiarch_sse2
+ all_tests_multiarch_sse3
+ all_tests_multiarch_ssse3
+ all_tests_multiarch_sse41
+ all_tests_multiarch_avx
+ all_tests_multiarch_avx2
+ all_tests_multiarch_avx512
+ )
+ endif ()
endif()
if(USE_SDE)
diff --git a/tests/all_tests.cpp b/tests/all_tests.cpp
@@ -7,6 +7,24 @@
using namespace kfr;
+#ifdef KFR_MULTI_ARCH
+
+#define FORCE_LINK(arch) \
+ namespace arch \
+ { \
+ extern void force_link(); \
+ void (*p)() = &force_link; \
+ }
+
+FORCE_LINK(sse2)
+FORCE_LINK(sse3)
+FORCE_LINK(ssse3)
+FORCE_LINK(sse41)
+FORCE_LINK(avx)
+FORCE_LINK(avx2)
+// FORCE_LINK(avx512)
+#endif
+
int main()
{
println(library_version(), " running on ", cpu_runtime());
@@ -16,7 +34,7 @@ int main()
return -1;
}
#ifdef HAVE_MPFR
- mpfr::scoped_precision p(128);
+ mpfr::scoped_precision p(64);
#endif
return testo::run_all("");
}
diff --git a/tests/all_tests_merged.cpp b/tests/all_tests_merged.cpp
@@ -0,0 +1,25 @@
+#include <kfr/cident.h>
+
+CMT_PRAGMA_GNU(GCC diagnostic push)
+CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wparentheses")
+
+#include "auto_test.cpp"
+
+#include "base_test.cpp"
+#include "complex_test.cpp"
+#include "dsp_test.cpp"
+#include "expression_test.cpp"
+#include "intrinsic_test.cpp"
+#include "io_test.cpp"
+#include "resampler_test.cpp"
+
+#ifndef KFR_NO_DFT
+#include "dft_test.cpp"
+#endif
+
+namespace CMT_ARCH_NAME
+{
+void force_link() {}
+} // namespace CMT_ARCH_NAME
+
+CMT_PRAGMA_GNU(GCC diagnostic pop)
diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp
@@ -0,0 +1,213 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+
+#include <kfr/base.hpp>
+#include <kfr/io.hpp>
+#include <kfr/testo/console_colors.hpp>
+
+using namespace kfr;
+
+#define TEST_ASM_8(fn, ty, MACRO) \
+ MACRO(fn, ty, 1) \
+ MACRO(fn, ty, 2) \
+ MACRO(fn, ty, 4) \
+ MACRO(fn, ty, 8) \
+ MACRO(fn, ty, 16) \
+ MACRO(fn, ty, 32) \
+ MACRO(fn, ty, 64)
+
+#define TEST_ASM_16(fn, ty, MACRO) \
+ MACRO(fn, ty, 1) \
+ MACRO(fn, ty, 2) \
+ MACRO(fn, ty, 4) \
+ MACRO(fn, ty, 8) \
+ MACRO(fn, ty, 16) \
+ MACRO(fn, ty, 32) \
+ MACRO(fn, ty, 64)
+
+#define TEST_ASM_32(fn, ty, MACRO) \
+ MACRO(fn, ty, 1) \
+ MACRO(fn, ty, 2) \
+ MACRO(fn, ty, 4) \
+ MACRO(fn, ty, 8) \
+ MACRO(fn, ty, 16) \
+ MACRO(fn, ty, 32)
+
+#define TEST_ASM_64(fn, ty, MACRO) \
+ MACRO(fn, ty, 1) \
+ MACRO(fn, ty, 2) \
+ MACRO(fn, ty, 4) \
+ MACRO(fn, ty, 8) \
+ MACRO(fn, ty, 16)
+
+#ifdef CMT_COMPILER_MSVC
+#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT
+#else
+#define KFR_PUBLIC CMT_PUBLIC_C
+#endif
+
+#define TEST_ASM_VTY1(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x) { r = kfr::fn(x); }
+
+#define TEST_ASM_VTY1_F(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<flt_type<ty>, n>& r, const vec<ty, n>& x) \
+ { \
+ r = kfr::fn(x); \
+ }
+
+#define TEST_ASM_VTY2(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y) \
+ { \
+ r = kfr::fn(x, y); \
+ } \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__scalar(vec<ty, n>& r, const vec<ty, n>& x, \
+ const ty& y) \
+ { \
+ r = kfr::fn(x, y); \
+ }
+#define TEST_ASM_CMP(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(mask<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y) \
+ { \
+ r = kfr::fn(x, y); \
+ }
+#define TEST_ASM_SHIFT(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, \
+ const vec<utype<ty>, n>& y) \
+ { \
+ r = kfr::fn(x, y); \
+ }
+#define TEST_ASM_SHIFT_SCALAR(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__scalar(vec<ty, n>& r, const vec<ty, n>& x, unsigned y) \
+ { \
+ r = kfr::fn(x, y); \
+ }
+#define TEST_ASM_VTY3(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x, const vec<ty, n>& y, \
+ const vec<ty, n>& z) \
+ { \
+ r = kfr::fn(x, y, z); \
+ }
+
+#define GEN_ty(n, ty) ty(n)
+#define GEN_arg_def(n, ty) ty arg##n
+#define GEN_arg(n, ty) arg##n
+
+#define TEST_ASM_MAKE_VECTOR(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, CMT_GEN_LIST(n, GEN_arg_def, ty)) \
+ { \
+ r = kfr::fn(CMT_GEN_LIST(n, GEN_arg, ty)); \
+ } \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__imm(vec<ty, n>& r) \
+ { \
+ r = kfr::fn(CMT_GEN_LIST(n, GEN_ty, ty)); \
+ }
+
+#define TEST_ASM_BROADCAST(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, ty x) { r = kfr::fn<n>(x); }
+
+#define TEST_ASM_HALF1(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n * 2>& x) { r = kfr::fn(x); }
+
+#define TEST_ASM_DOUBLE2(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x, \
+ const vec<ty, n>& y) \
+ { \
+ r = kfr::fn(x, y); \
+ }
+
+#define TEST_ASM_DOUBLE1(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); }
+
+#define TEST_ASM_U(fn, MACRO) \
+ TEST_ASM_8(fn, u8, MACRO) \
+ TEST_ASM_16(fn, u16, MACRO) \
+ TEST_ASM_32(fn, u32, MACRO) \
+ TEST_ASM_64(fn, u64, MACRO)
+
+#define TEST_ASM_I(fn, MACRO) \
+ TEST_ASM_8(fn, i8, MACRO) \
+ TEST_ASM_16(fn, i16, MACRO) \
+ TEST_ASM_32(fn, i32, MACRO) \
+ TEST_ASM_64(fn, i64, MACRO)
+
+#define TEST_ASM_F(fn, MACRO) \
+ TEST_ASM_32(fn, f32, MACRO) \
+ TEST_ASM_64(fn, f64, MACRO)
+
+#define TEST_ASM_UI(fn, MACRO) TEST_ASM_U(fn, MACRO) TEST_ASM_I(fn, MACRO)
+
+#define TEST_ASM_UIF(fn, MACRO) TEST_ASM_U(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO)
+
+#define TEST_ASM_IF(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO)
+
+TEST_ASM_UIF(add, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(sub, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(mul, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(bitwiseand, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(equal, TEST_ASM_CMP)
+
+TEST_ASM_IF(abs, TEST_ASM_VTY1)
+
+TEST_ASM_IF(sqrt, TEST_ASM_VTY1_F)
+
+TEST_ASM_IF(neg, TEST_ASM_VTY1)
+
+TEST_ASM_UIF(bitwisenot, TEST_ASM_VTY1)
+
+TEST_ASM_UIF(div, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(bitwiseor, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(bitwisexor, TEST_ASM_VTY2)
+
+TEST_ASM_UIF(notequal, TEST_ASM_CMP)
+
+TEST_ASM_UIF(less, TEST_ASM_CMP)
+
+TEST_ASM_UIF(greater, TEST_ASM_CMP)
+
+TEST_ASM_UIF(lessorequal, TEST_ASM_CMP)
+
+TEST_ASM_UIF(greaterorequal, TEST_ASM_CMP)
+
+TEST_ASM_UIF(low, TEST_ASM_HALF1)
+
+TEST_ASM_UIF(high, TEST_ASM_HALF1)
+
+TEST_ASM_UIF(concat, TEST_ASM_DOUBLE2)
+
+TEST_ASM_UIF(shl, TEST_ASM_SHIFT)
+
+TEST_ASM_UIF(shr, TEST_ASM_SHIFT)
+
+TEST_ASM_UIF(shl, TEST_ASM_SHIFT_SCALAR)
+
+TEST_ASM_UIF(shr, TEST_ASM_SHIFT_SCALAR)
+
+TEST_ASM_UIF(duphalfs, TEST_ASM_DOUBLE1)
+
+TEST_ASM_F(sin, TEST_ASM_VTY1_F)
+
+TEST_ASM_F(cos, TEST_ASM_VTY1_F)
+
+TEST_ASM_UIF(sqr, TEST_ASM_VTY1)
+
+TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR)
+
+TEST_ASM_UIF(broadcast, TEST_ASM_BROADCAST)
+
+namespace kfr
+{
+#ifdef KFR_SHOW_NOT_OPTIMIZED
+CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); }
+#endif
+} // namespace kfr
+
+int main() { println(library_version()); }
diff --git a/tests/base_test.cpp b/tests/base_test.cpp
@@ -6,11 +6,14 @@
#include <kfr/testo/testo.hpp>
-#include <kfr/base.hpp>
#include <kfr/io.hpp>
+#include <kfr/simd.hpp>
using namespace kfr;
+namespace CMT_ARCH_NAME
+{
+
TEST(test_basic)
{
// How to make a vector:
@@ -76,359 +79,20 @@ TEST(test_basic)
CHECK(odd(numbers1) == vec<int, 4>{ 1, 3, 5, 7 });
CHECK(even(numbers2) == vec<int, 4>{ 100, 102, 104, 106 });
- // * The following command pairs are equivalent:
- CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
- CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
-
- CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) ==
- vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
- CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
-
- CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) ==
- vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
- CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) ==
- vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
-
- CHECK(splitpairs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6, 1, 3, 5, 7));
- CHECK(splitpairs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7));
-
- CHECK(interleavehalfs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 4, 1, 5, 2, 6, 3, 7));
- CHECK(interleavehalfs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7));
-
CHECK(subadd(pack(0, 1, 2, 3, 4, 5, 6, 7), pack(10, 10, 10, 10, 10, 10, 10, 10)) ==
pack(-10, 11, -8, 13, -6, 15, -4, 17));
CHECK(addsub(pack(0, 1, 2, 3, 4, 5, 6, 7), pack(10, 10, 10, 10, 10, 10, 10, 10)) ==
pack(10, -9, 12, -7, 14, -5, 16, -3));
- CHECK(broadcast<8>(1) == pack(1, 1, 1, 1, 1, 1, 1, 1));
- CHECK(broadcast<8>(1, 2) == pack(1, 2, 1, 2, 1, 2, 1, 2));
- CHECK(broadcast<8>(1, 2, 3, 4) == pack(1, 2, 3, 4, 1, 2, 3, 4));
- CHECK(broadcast<8>(1, 2, 3, 4, 5, 6, 7, 8) == pack(1, 2, 3, 4, 5, 6, 7, 8));
-
- CHECK(even(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6));
- CHECK(odd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 3, 5, 7));
-
- CHECK(even<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5));
- CHECK(odd<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(2, 3, 6, 7));
-
- CHECK(reverse(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(7, 6, 5, 4, 3, 2, 1, 0));
- CHECK(reverse<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(6, 7, 4, 5, 2, 3, 0, 1));
- CHECK(reverse<4>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(4, 5, 6, 7, 0, 1, 2, 3));
-
CHECK(digitreverse4(pack(0.f, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)) ==
pack(0.f, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));
- CHECK(dup(pack(0, 1, 2, 3)) == pack(0, 0, 1, 1, 2, 2, 3, 3));
- CHECK(duphalfs(pack(0, 1, 2, 3)) == pack(0, 1, 2, 3, 0, 1, 2, 3));
- CHECK(dupeven(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 0, 2, 2, 4, 4, 6, 6));
- CHECK(dupodd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 1, 3, 3, 5, 5, 7, 7));
-
CHECK(inrange(pack(1, 2, 3), 1, 3) == make_mask<int>(true, true, true));
CHECK(inrange(pack(1, 2, 3), 1, 2) == make_mask<int>(true, true, false));
CHECK(inrange(pack(1, 2, 3), 1, 1) == make_mask<int>(true, false, false));
-
- // * Transpose matrix:
- const auto sixteen = enumerate<float, 16>();
- CHECK(transpose<4>(sixteen) == vec<float, 16>(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));
-}
-
-TEST(concat)
-{
- CHECK(concat(vec<f32, 1>{ 1 }, vec<f32, 2>{ 2, 3 }, vec<f32, 1>{ 4 }, vec<f32, 3>{ 5, 6, 7 }) //
- == vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 });
-}
-
-TEST(split)
-{
- vec<f32, 1> a1;
- vec<f32, 2> a23;
- vec<f32, 1> a4;
- vec<f32, 3> a567;
- split(vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }, a1, a23, a4, a567);
- CHECK(a1 == vec<f32, 1>{ 1 });
- CHECK(a23 == vec<f32, 2>{ 2, 3 });
- CHECK(a4 == vec<f32, 1>{ 4 });
- CHECK(a567 == vec<f32, 3>{ 5, 6, 7 });
-}
-
-TEST(broadcast)
-{
- CHECK(broadcast<5>(3.f) == vec<f32, 5>{ 3, 3, 3, 3, 3 });
- CHECK(broadcast<6>(1.f, 2.f) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 });
- CHECK(broadcast<6>(1.f, 2.f, 3.f) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 });
-}
-
-TEST(resize)
-{
- CHECK(resize<5>(make_vector(3.f)) == vec<f32, 5>{ 3, 3, 3, 3, 3 });
- CHECK(resize<6>(make_vector(1.f, 2.f)) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 });
- CHECK(resize<6>(make_vector(1.f, 2.f, 3.f)) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 });
-}
-
-TEST(make_vector)
-{
- const signed char ch = -1;
- CHECK(make_vector(1, 2, ch) == vec<i32, 3>{ 1, 2, -1 });
- const i64 v = -100;
- CHECK(make_vector(1, 2, v) == vec<i64, 3>{ 1, 2, -100 });
- CHECK(make_vector<i64>(1, 2, ch) == vec<i64, 3>{ 1, 2, -1 });
- CHECK(make_vector<f32>(1, 2, ch) == vec<f32, 3>{ 1, 2, -1 });
-
- CHECK(make_vector(f64x2{ 1, 2 }, f64x2{ 10, 20 }) ==
- vec<vec<f64, 2>, 2>{ f64x2{ 1, 2 }, f64x2{ 10, 20 } });
- CHECK(make_vector(1.f, f32x2{ 10, 20 }) == vec<vec<f32, 2>, 2>{ f32x2{ 1, 1 }, f32x2{ 10, 20 } });
-}
-
-TEST(apply)
-{
- CHECK(apply([](int x) { return x + 1; }, make_vector(1, 2, 3, 4, 5)) == make_vector(2, 3, 4, 5, 6));
- CHECK(apply(fn::sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25));
-}
-
-TEST(zerovector)
-{
- CHECK(zerovector<f32, 3>() == f32x3{ 0, 0, 0 });
- // CHECK(zerovector<i16, 3>() == i16x3{ 0, 0, 0 }); // clang 3.9 (trunk) crashes here
- CHECK(zerovector(f64x8{}) == f64x8{ 0, 0, 0, 0, 0, 0, 0, 0 });
-}
-
-TEST(allonesvector)
-{
- CHECK(bitcast<u32>(constants<f32>::allones()) == 0xFFFFFFFFu);
- CHECK(bitcast<u64>(constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull);
-
- CHECK(~allonesvector<f32, 3>() == f32x3{ 0, 0, 0 });
- CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 });
- CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 });
}
-TEST(low_high)
-{
- CHECK(low(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(1, 2, 3, 4));
- CHECK(high(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(5, 6, 7, 8));
-
- CHECK(low(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 4>(1, 2, 3, 4));
- CHECK(high(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 3>(5, 6, 7));
-
- CHECK(low(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 4>(1, 2, 3, 4));
- CHECK(high(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 2>(5, 6));
-
- CHECK(low(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 4>(1, 2, 3, 4));
- CHECK(high(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 1>(5));
-
- CHECK(low(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(1, 2));
- CHECK(high(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(3, 4));
-
- CHECK(low(vec<u8, 3>(1, 2, 3)) == vec<u8, 2>(1, 2));
- CHECK(high(vec<u8, 3>(1, 2, 3)) == vec<u8, 1>(3));
-
- CHECK(low(vec<u8, 2>(1, 2)) == vec<u8, 1>(1));
- CHECK(high(vec<u8, 2>(1, 2)) == vec<u8, 1>(2));
-}
-
-#ifdef CMT_COMPILER_CLANG
-TEST(matrix)
-{
- using i32x2x2 = vec<vec<int, 2>, 2>;
- const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } };
- CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } });
-
- CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } });
-
- i32x2 xy{ 10, 20 };
- i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } };
- xy = hadd(xy * m);
- CHECK(xy == i32x2{ 40, 120 });
-
- i32x2 xy2{ 10, 20 };
- xy2 = hadd(transpose(xy2 * m));
- CHECK(xy2 == i32x2{ 50, 110 });
-}
-#endif
-
-TEST(is_convertible)
-{
- static_assert(std::is_convertible<float, f32x4>::value, "");
- static_assert(std::is_convertible<float, f64x8>::value, "");
- static_assert(std::is_convertible<float, u8x3>::value, "");
-
- static_assert(std::is_convertible<u16x4, i32x4>::value, "");
- static_assert(!std::is_convertible<u16x4, i32x3>::value, "");
- static_assert(!std::is_convertible<u16x1, u16x16>::value, "");
-
- static_assert(std::is_convertible<float, complex<float>>::value, "");
- static_assert(std::is_convertible<float, complex<double>>::value, "");
- static_assert(std::is_convertible<short, complex<double>>::value, "");
-
- static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, "");
- static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, "");
-
- static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, "");
- static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, "");
-
- testo::assert_is_same<i32x4, common_type<i32x4>>();
- testo::assert_is_same<u32x4, common_type<i32x4, u32x4>>();
- testo::assert_is_same<f64x4, common_type<i32x4, u32x4, f64x4>>();
-
- CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f });
- CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 });
- CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 });
-
- CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 });
-
- CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f });
- CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. });
- CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. });
-
- CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) ==
- vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } });
-
- CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) ==
- vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. } });
-
- CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{
- vec<float, 4>{ 1.f, 2.f, 3.f, 4.f }, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) ==
- vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } });
-}
-
-TEST(transcendental)
-{
- CHECK(kfr::sin(1.0f) == 0.8414709848078965066525023216303f);
- CHECK(kfr::sin(1.0) == 0.8414709848078965066525023216303);
-
- CHECK(kfr::cos(1.0f) == 0.54030230586813971740093660744298f);
- CHECK(kfr::cos(1.0) == 0.54030230586813971740093660744298);
-
- CHECK(kfr::tan(1.0f) == 1.5574077246549022305069748074584f);
- CHECK(kfr::tan(1.0) == 1.5574077246549022305069748074584);
-
- CHECK(kfr::asin(0.45f) == 0.46676533904729636185033976030414f);
- CHECK(kfr::asin(0.45) == 0.46676533904729636185033976030414);
-
- CHECK(kfr::acos(0.45f) == 1.1040309877476002573809819313356f);
- CHECK(kfr::acos(0.45) == 1.1040309877476002573809819313356);
-
- CHECK(kfr::atan(0.45f) == 0.42285392613294071296648279098114f);
- CHECK(kfr::atan(0.45) == 0.42285392613294071296648279098114);
-
- CHECK(kfr::sinh(1.0f) == 1.1752011936438014568823818505956f);
- CHECK(kfr::sinh(1.0) == 1.1752011936438014568823818505956);
-
- CHECK(kfr::cosh(1.0f) == 1.5430806348152437784779056207571f);
- CHECK(kfr::cosh(1.0) == 1.5430806348152437784779056207571);
-
- CHECK(kfr::tanh(1.0f) == 0.76159415595576488811945828260479f);
- CHECK(kfr::tanh(1.0) == 0.76159415595576488811945828260479);
-
- CHECK(kfr::exp(0.75f) == 2.1170000166126746685453698198371f);
- CHECK(kfr::exp(0.75) == 2.1170000166126746685453698198371);
-
- CHECK(kfr::exp(-0.75f) == 0.47236655274101470713804655094327f);
- CHECK(kfr::exp(-0.75) == 0.47236655274101470713804655094327);
-
- CHECK(kfr::log(2.45f) == 0.89608802455663561677548191074382f);
- CHECK(kfr::log(2.45) == 0.89608802455663561677548191074382);
-}
-
-TEST(horner)
-{
- CHECK(horner(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 17, 34));
- CHECK(horner_odd(pack(0, 1, 2, 3), 1, 2, 3) == pack(0, 6, 114, 786));
- CHECK(horner_even(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 57, 262));
-}
-
-TEST(test_stat)
-{
- {
- univector<float, 5> a({ 1, 2, 3, 4, 5 });
- CHECK(sum(a) == 15);
- CHECK(mean(a) == 3);
- CHECK(minof(a) == 1);
- CHECK(maxof(a) == 5);
- CHECK(sumsqr(a) == 55);
- CHECK(rms(a) == 3.316624790355399849115f);
- CHECK(product(a) == 120);
- }
- {
- univector<double, 5> a({ 1, 2, 3, 4, 5 });
- CHECK(sum(a) == 15);
- CHECK(mean(a) == 3);
- CHECK(minof(a) == 1);
- CHECK(maxof(a) == 5);
- CHECK(sumsqr(a) == 55);
- CHECK(rms(a) == 3.316624790355399849115);
- CHECK(product(a) == 120);
- }
- {
- univector<int, 5> a({ 1, 2, 3, 4, 5 });
- CHECK(sum(a) == 15);
- CHECK(mean(a) == 3);
- CHECK(minof(a) == 1);
- CHECK(maxof(a) == 5);
- CHECK(sumsqr(a) == 55);
- CHECK(product(a) == 120);
- }
- {
- univector<complex<float>, 5> a({ 1, 2, 3, 4, 5 });
- CHECK(sum(a) == c32{ 15 });
- CHECK(mean(a) == c32{ 3 });
- CHECK(sumsqr(a) == c32{ 55 });
- CHECK(product(a) == c32{ 120 });
- }
-}
-
-TEST(sample_conversion)
-{
- CHECK(convert_sample<float>(static_cast<i8>(-127)) == -1.f);
- CHECK(convert_sample<float>(static_cast<i8>(0)) == 0.f);
- CHECK(convert_sample<float>(static_cast<i8>(127)) == 1.f);
-
- CHECK(convert_sample<float>(static_cast<i16>(-32767)) == -1.f);
- CHECK(convert_sample<float>(static_cast<i16>(0)) == 0.f);
- CHECK(convert_sample<float>(static_cast<i16>(32767)) == 1.f);
-
- CHECK(convert_sample<float>(static_cast<i24>(-8388607)) == -1.f);
- CHECK(convert_sample<float>(static_cast<i24>(0)) == 0.f);
- CHECK(convert_sample<float>(static_cast<i24>(8388607)) == 1.f);
-
- CHECK(convert_sample<float>(static_cast<i32>(-2147483647)) == -1.f);
- CHECK(convert_sample<float>(static_cast<i32>(0)) == 0.f);
- CHECK(convert_sample<float>(static_cast<i32>(2147483647)) == 1.f);
-
- CHECK(convert_sample<i8>(-1.f) == -127);
- CHECK(convert_sample<i8>(0.f) == 0);
- CHECK(convert_sample<i8>(1.f) == 127);
-
- CHECK(convert_sample<i16>(-1.f) == -32767);
- CHECK(convert_sample<i16>(0.f) == 0);
- CHECK(convert_sample<i16>(1.f) == 32767);
-
- CHECK(convert_sample<i24>(-1.f) == -8388607);
- CHECK(convert_sample<i24>(0.f) == 0);
- CHECK(convert_sample<i24>(1.f) == 8388607);
-
- CHECK(convert_sample<i32>(-1.f) == -2147483647);
- CHECK(convert_sample<i32>(0.f) == 0);
- CHECK(convert_sample<i32>(1.f) == 2147483647);
-}
-
-TEST(sample_interleave_deinterleave)
-{
- const size_t size = 50;
- univector2d<float> in;
- in.push_back(truncate(counter() * 3.f + 0.f, size));
- in.push_back(truncate(counter() * 3.f + 1.f, size));
- in.push_back(truncate(counter() * 3.f + 2.f, size));
- univector<float> out(size * 3);
- interleave(out.data(), (const float* []){ in[0].data(), in[1].data(), in[2].data() }, 3, size);
- CHECK(maxof(out - render(counter() * 1.f, out.size())) == 0);
-
- deinterleave((float* []){ in[0].data(), in[1].data(), in[2].data() }, out.data(), 3, size);
-
- CHECK(absmaxof(in[0] - render(counter() * 3.f + 0.f, size)) == 0);
- CHECK(absmaxof(in[1] - render(counter() * 3.f + 1.f, size)) == 0);
- CHECK(absmaxof(in[2] - render(counter() * 3.f + 2.f, size)) == 0);
-}
+} // namespace CMT_ARCH_NAME
#ifndef KFR_NO_MAIN
int main()
diff --git a/tests/complex_test.cpp b/tests/complex_test.cpp
@@ -11,6 +11,9 @@
using namespace kfr;
+namespace CMT_ARCH_NAME
+{
+
TEST(complex_vector)
{
const vec<c32, 1> c32x1{ c32{ 0, 1 } };
@@ -68,9 +71,11 @@ TEST(complex_math)
{
const vec<c32, 1> a{ c32{ 1, 2 } };
const vec<c32, 1> b{ c32{ 3, 4 } };
+ CHECK(c32(vec<c32, 1>(2)[0]) == c32{ 2, 0 });
CHECK(a + b == make_vector(c32{ 4, 6 }));
CHECK(a - b == make_vector(c32{ -2, -2 }));
CHECK(a * b == make_vector(c32{ -5, 10 }));
+ CHECK(a * vec<c32, 1>(2) == make_vector(c32{ 2, 4 }));
CHECK(a * 2 == make_vector(c32{ 2, 4 }));
CHECK(a / b == make_vector(c32{ 0.44f, 0.08f }));
CHECK(-a == make_vector(c32{ -1, -2 }));
@@ -88,8 +93,7 @@ TEST(complex_math)
CHECK(cabs(-3.f) == 3.f);
CHECK(cabs(make_vector(-3.f)) == make_vector(3.f));
- testo::epsilon<f32>() *= 5;
- testo::epsilon<f64>() *= 5;
+ testo::eplison_scope<void> eps(5);
CHECK(csin(c32{ 1.f, 1.f }) == c32{ 1.2984575814159773f, 0.634963914784736f });
CHECK(ccos(c32{ 1.f, 1.f }) == c32{ 0.8337300251311489f, -0.9888977057628651f });
@@ -176,13 +180,6 @@ TEST(complex_function_expressions)
TEST(static_tests)
{
-#ifdef CMT_ARCH_SSE2
- static_assert(platform<f32, cpu_t::sse2>::vector_width == 4, "");
- static_assert(platform<c32, cpu_t::sse2>::vector_width == 2, "");
- static_assert(platform<i32, cpu_t::sse2>::vector_width == 4, "");
- static_assert(platform<complex<i32>, cpu_t::sse2>::vector_width == 2, "");
-#endif
-
static_assert(is_numeric<vec<complex<float>, 4>>::value, "");
static_assert(is_numeric_args<vec<complex<float>, 4>>::value, "");
@@ -207,8 +204,9 @@ TEST(static_tests)
testo::assert_is_same<kfr::internal::arg<complex<int>>,
kfr::internal::expression_scalar<kfr::complex<int>, 1>>();
- testo::assert_is_same<common_type<complex<int>, double>, complex<double>>();
+ testo::assert_is_same<kfr::common_type<complex<int>, double>, complex<double>>();
}
+} // namespace CMT_ARCH_NAME
#ifndef KFR_NO_MAIN
int main()
diff --git a/tests/data/acos_double_fuzz b/tests/data/acos_double_fuzz
Binary files differ.
diff --git a/tests/data/acos_double_narrow b/tests/data/acos_double_narrow
Binary files differ.
diff --git a/tests/data/acos_float_fuzz b/tests/data/acos_float_fuzz
Binary files differ.
diff --git a/tests/data/acos_float_narrow b/tests/data/acos_float_narrow
Binary files differ.
diff --git a/tests/data/asin_double_fuzz b/tests/data/asin_double_fuzz
Binary files differ.
diff --git a/tests/data/asin_double_narrow b/tests/data/asin_double_narrow
Binary files differ.
diff --git a/tests/data/asin_float_fuzz b/tests/data/asin_float_fuzz
Binary files differ.
diff --git a/tests/data/asin_float_narrow b/tests/data/asin_float_narrow
Binary files differ.
diff --git a/tests/data/atan2_double_fuzz b/tests/data/atan2_double_fuzz
Binary files differ.
diff --git a/tests/data/atan2_double_narrow b/tests/data/atan2_double_narrow
Binary files differ.
diff --git a/tests/data/atan2_float_fuzz b/tests/data/atan2_float_fuzz
Binary files differ.
diff --git a/tests/data/atan2_float_narrow b/tests/data/atan2_float_narrow
Binary files differ.
diff --git a/tests/data/atan_double_fuzz b/tests/data/atan_double_fuzz
Binary files differ.
diff --git a/tests/data/atan_double_narrow b/tests/data/atan_double_narrow
Binary files differ.
diff --git a/tests/data/atan_float_fuzz b/tests/data/atan_float_fuzz
Binary files differ.
diff --git a/tests/data/atan_float_narrow b/tests/data/atan_float_narrow
Binary files differ.
diff --git a/tests/data/cbrt_double_fuzz b/tests/data/cbrt_double_fuzz
Binary files differ.
diff --git a/tests/data/cbrt_double_narrow b/tests/data/cbrt_double_narrow
Binary files differ.
diff --git a/tests/data/cbrt_float_fuzz b/tests/data/cbrt_float_fuzz
Binary files differ.
diff --git a/tests/data/cbrt_float_narrow b/tests/data/cbrt_float_narrow
Binary files differ.
diff --git a/tests/data/cos_double_fuzz b/tests/data/cos_double_fuzz
Binary files differ.
diff --git a/tests/data/cos_double_narrow b/tests/data/cos_double_narrow
Binary files differ.
diff --git a/tests/data/cos_float_fuzz b/tests/data/cos_float_fuzz
Binary files differ.
diff --git a/tests/data/cos_float_narrow b/tests/data/cos_float_narrow
Binary files differ.
diff --git a/tests/data/cosh_double_fuzz b/tests/data/cosh_double_fuzz
Binary files differ.
diff --git a/tests/data/cosh_double_narrow b/tests/data/cosh_double_narrow
Binary files differ.
diff --git a/tests/data/cosh_float_fuzz b/tests/data/cosh_float_fuzz
Binary files differ.
diff --git a/tests/data/cosh_float_narrow b/tests/data/cosh_float_narrow
Binary files differ.
diff --git a/tests/data/coth_double_fuzz b/tests/data/coth_double_fuzz
Binary files differ.
diff --git a/tests/data/coth_double_narrow b/tests/data/coth_double_narrow
Binary files differ.
diff --git a/tests/data/coth_float_fuzz b/tests/data/coth_float_fuzz
Binary files differ.
diff --git a/tests/data/coth_float_narrow b/tests/data/coth_float_narrow
Binary files differ.
diff --git a/tests/data/exp10_double_fuzz b/tests/data/exp10_double_fuzz
Binary files differ.
diff --git a/tests/data/exp10_double_narrow b/tests/data/exp10_double_narrow
Binary files differ.
diff --git a/tests/data/exp10_float_fuzz b/tests/data/exp10_float_fuzz
Binary files differ.
diff --git a/tests/data/exp10_float_narrow b/tests/data/exp10_float_narrow
Binary files differ.
diff --git a/tests/data/exp2_double_fuzz b/tests/data/exp2_double_fuzz
Binary files differ.
diff --git a/tests/data/exp2_double_narrow b/tests/data/exp2_double_narrow
Binary files differ.
diff --git a/tests/data/exp2_float_fuzz b/tests/data/exp2_float_fuzz
Binary files differ.
diff --git a/tests/data/exp2_float_narrow b/tests/data/exp2_float_narrow
Binary files differ.
diff --git a/tests/data/exp_double_fuzz b/tests/data/exp_double_fuzz
Binary files differ.
diff --git a/tests/data/exp_double_narrow b/tests/data/exp_double_narrow
Binary files differ.
diff --git a/tests/data/exp_float_fuzz b/tests/data/exp_float_fuzz
Binary files differ.
diff --git a/tests/data/exp_float_narrow b/tests/data/exp_float_narrow
Binary files differ.
diff --git a/tests/data/gamma_double_fuzz b/tests/data/gamma_double_fuzz
Binary files differ.
diff --git a/tests/data/gamma_double_narrow b/tests/data/gamma_double_narrow
Binary files differ.
diff --git a/tests/data/gamma_float_fuzz b/tests/data/gamma_float_fuzz
Binary files differ.
diff --git a/tests/data/gamma_float_narrow b/tests/data/gamma_float_narrow
Binary files differ.
diff --git a/tests/data/log10_double_fuzz b/tests/data/log10_double_fuzz
Binary files differ.
diff --git a/tests/data/log10_double_narrow b/tests/data/log10_double_narrow
Binary files differ.
diff --git a/tests/data/log10_float_fuzz b/tests/data/log10_float_fuzz
Binary files differ.
diff --git a/tests/data/log10_float_narrow b/tests/data/log10_float_narrow
Binary files differ.
diff --git a/tests/data/log2_double_fuzz b/tests/data/log2_double_fuzz
Binary files differ.
diff --git a/tests/data/log2_double_narrow b/tests/data/log2_double_narrow
Binary files differ.
diff --git a/tests/data/log2_float_fuzz b/tests/data/log2_float_fuzz
Binary files differ.
diff --git a/tests/data/log2_float_narrow b/tests/data/log2_float_narrow
Binary files differ.
diff --git a/tests/data/log_double_fuzz b/tests/data/log_double_fuzz
Binary files differ.
diff --git a/tests/data/log_double_narrow b/tests/data/log_double_narrow
Binary files differ.
diff --git a/tests/data/log_float_fuzz b/tests/data/log_float_fuzz
Binary files differ.
diff --git a/tests/data/log_float_narrow b/tests/data/log_float_narrow
Binary files differ.
diff --git a/tests/data/sin_double_fuzz b/tests/data/sin_double_fuzz
Binary files differ.
diff --git a/tests/data/sin_double_narrow b/tests/data/sin_double_narrow
Binary files differ.
diff --git a/tests/data/sin_float_fuzz b/tests/data/sin_float_fuzz
Binary files differ.
diff --git a/tests/data/sin_float_narrow b/tests/data/sin_float_narrow
Binary files differ.
diff --git a/tests/data/sinh_double_fuzz b/tests/data/sinh_double_fuzz
Binary files differ.
diff --git a/tests/data/sinh_double_narrow b/tests/data/sinh_double_narrow
Binary files differ.
diff --git a/tests/data/sinh_float_fuzz b/tests/data/sinh_float_fuzz
Binary files differ.
diff --git a/tests/data/sinh_float_narrow b/tests/data/sinh_float_narrow
Binary files differ.
diff --git a/tests/data/tan_double_fuzz b/tests/data/tan_double_fuzz
Binary files differ.
diff --git a/tests/data/tan_double_narrow b/tests/data/tan_double_narrow
Binary files differ.
diff --git a/tests/data/tan_float_fuzz b/tests/data/tan_float_fuzz
Binary files differ.
diff --git a/tests/data/tan_float_narrow b/tests/data/tan_float_narrow
Binary files differ.
diff --git a/tests/data/tanh_double_fuzz b/tests/data/tanh_double_fuzz
Binary files differ.
diff --git a/tests/data/tanh_double_narrow b/tests/data/tanh_double_narrow
Binary files differ.
diff --git a/tests/data/tanh_float_fuzz b/tests/data/tanh_float_fuzz
Binary files differ.
diff --git a/tests/data/tanh_float_narrow b/tests/data/tanh_float_narrow
Binary files differ.
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -14,6 +14,9 @@
using namespace kfr;
+namespace CMT_ARCH_NAME
+{
+
#ifdef KFR_NATIVE_F64
constexpr ctypes_t<float, double> dft_float_types{};
#else
@@ -25,7 +28,7 @@ TEST(test_convolve)
univector<fbase, 5> a({ 1, 2, 3, 4, 5 });
univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 });
univector<fbase> c = convolve(a, b);
- CHECK(c.size() == 9);
+ CHECK(c.size() == 9u);
CHECK(rms(c - univector<fbase>({ 0.25, 1., 2.75, 2.5, 3.75, 3.5, 1.5, -4., 7.5 })) < 0.0001);
}
@@ -44,7 +47,7 @@ TEST(test_correlate)
univector<fbase, 5> a({ 1, 2, 3, 4, 5 });
univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 });
univector<fbase> c = correlate(a, b);
- CHECK(c.size() == 9);
+ CHECK(c.size() == 9u);
CHECK(rms(c - univector<fbase>({ 1.5, 1., 1.5, 2.5, 3.75, -4., 7.75, 3.5, 1.25 })) < 0.0001);
}
@@ -87,58 +90,60 @@ TEST(fft_accuracy)
#endif
println(sizes);
- testo::matrix(
- named("type") = dft_float_types, //
- named("size") = sizes, //
- [&gen](auto type, size_t size) {
- using float_type = type_of<decltype(type)>;
- const double min_prec = 0.000001 * std::log(size) * size;
-
- for (bool inverse : { false, true })
- {
- testo::active_test()->append_comment(inverse ? "complex-inverse" : "complex-direct");
- univector<complex<float_type>> in =
- truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
- univector<complex<float_type>> out = in;
- univector<complex<float_type>> refout = out;
- univector<complex<float_type>> outo = in;
- const dft_plan<float_type> dft(size);
- univector<u8> temp(dft.temp_size);
-
- reference_dft(refout.data(), in.data(), size, inverse);
- dft.execute(outo, in, temp, inverse);
- dft.execute(out, out, temp, inverse);
-
- const float_type rms_diff_inplace = rms(cabs(refout - out));
- CHECK(rms_diff_inplace < min_prec);
- const float_type rms_diff_outofplace = rms(cabs(refout - outo));
- CHECK(rms_diff_outofplace < min_prec);
- }
-
- if (size >= 4 && is_poweroftwo(size))
- {
- univector<float_type> in = truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
-
- univector<complex<float_type>> out = truncate(scalar(qnan), size);
- univector<complex<float_type>> refout = truncate(scalar(qnan), size);
- const dft_plan_real<float_type> dft(size);
- univector<u8> temp(dft.temp_size);
-
- testo::active_test()->append_comment("real-direct");
- reference_fft(refout.data(), in.data(), size);
- dft.execute(out, in, temp);
- float_type rms_diff = rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1)));
- CHECK(rms_diff < min_prec);
-
- univector<float_type> out2(size, 0.f);
- testo::active_test()->append_comment("real-inverse");
- dft.execute(out2, out, temp);
- out2 = out2 / size;
- rms_diff = rms(in - out2);
- CHECK(rms_diff < min_prec);
- }
- });
+ testo::matrix(named("type") = dft_float_types, //
+ named("size") = sizes, //
+ [&gen](auto type, size_t size) {
+ using float_type = type_of<decltype(type)>;
+ const double min_prec = 0.000001 * std::log(size) * size;
+
+ for (bool inverse : { false, true })
+ {
+ testo::scope s(inverse ? "complex-inverse" : "complex-direct");
+ univector<complex<float_type>> in =
+ truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
+ univector<complex<float_type>> out = in;
+ univector<complex<float_type>> refout = out;
+ univector<complex<float_type>> outo = in;
+ const dft_plan<float_type> dft(size);
+ univector<u8> temp(dft.temp_size);
+
+ reference_dft(refout.data(), in.data(), size, inverse);
+ dft.execute(outo, in, temp, inverse);
+ dft.execute(out, out, temp, inverse);
+
+ const float_type rms_diff_inplace = rms(cabs(refout - out));
+ CHECK(rms_diff_inplace < min_prec);
+ const float_type rms_diff_outofplace = rms(cabs(refout - outo));
+ CHECK(rms_diff_outofplace < min_prec);
+ }
+
+ if (size >= 4 && is_poweroftwo(size))
+ {
+ univector<float_type> in =
+ truncate(gen_random_range<float_type>(gen, -1.0, +1.0), size);
+
+ univector<complex<float_type>> out = truncate(scalar(qnan), size);
+ univector<complex<float_type>> refout = truncate(scalar(qnan), size);
+ const dft_plan_real<float_type> dft(size);
+ univector<u8> temp(dft.temp_size);
+
+ testo::scope s("real-direct");
+ reference_fft(refout.data(), in.data(), size);
+ dft.execute(out, in, temp);
+ float_type rms_diff =
+ rms(cabs(refout.truncate(size / 2 + 1) - out.truncate(size / 2 + 1)));
+ CHECK(rms_diff < min_prec);
+
+ univector<float_type> out2(size, 0.f);
+ s.text = "real-inverse";
+ dft.execute(out2, out, temp);
+ out2 = out2 / size;
+ rms_diff = rms(in - out2);
+ CHECK(rms_diff < min_prec);
+ }
+ });
}
+} // namespace CMT_ARCH_NAME
#ifndef KFR_NO_MAIN
int main()
diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp
@@ -15,6 +15,9 @@
using namespace kfr;
+namespace CMT_ARCH_NAME
+{
+
struct TestFragment
{
float gain; // dB
@@ -235,6 +238,13 @@ TEST(ebu_lra_1_2_3_and_4)
});
}
+TEST(note_to_hertz)
+{
+ testo::eplison_scope<void> eps(1000);
+ CHECK(kfr::note_to_hertz(60) == fbase(261.6255653005986346778499935233));
+ CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233)));
+}
+
TEST(delay)
{
const univector<float, 33> v1 = counter() + 100;
@@ -265,7 +275,7 @@ TEST(mixdown)
[](size_t i) { return i + i * 2 + 100; });
}
-#ifdef CMT_COMPILER_CLANG
+#ifdef CMT_COMPILER_CLANG__
TEST(mixdown_stereo)
{
const univector<double, 21> left = counter();
@@ -289,29 +299,85 @@ TEST(phasor)
TEST(fir)
{
- const univector<double, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5);
- const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 };
-
- CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> double {
- double result = 0.0;
- for (size_t i = 0; i < taps.size(); i++)
- result += data.get(index - i, 0.0) * taps[i];
- return result;
- });
+#ifdef CMT_COMPILER_MSVC
+ // testo::matrix causes error in MSVC
+ {
+ using T = float;
+
+ const univector<T, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5);
+ const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 };
+
+ CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T {
+ T result = 0;
+ for (size_t i = 0; i < taps.size(); i++)
+ result += data.get(index - i, 0) * taps[i];
+ return result;
+ });
+
+ CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T {
+ T result = 0;
+ for (size_t i = 0; i < taps.size(); i++)
+ result += data.get(index - i, 0) * taps[i];
+ return result;
+ });
+ }
+ {
+ using T = double;
+
+ const univector<T, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5);
+ const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 };
+
+ CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T {
+ T result = 0;
+ for (size_t i = 0; i < taps.size(); i++)
+ result += data.get(index - i, 0) * taps[i];
+ return result;
+ });
+
+ CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T {
+ T result = 0;
+ for (size_t i = 0; i < taps.size(); i++)
+ result += data.get(index - i, 0) * taps[i];
+ return result;
+ });
+ }
+#else
+ testo::matrix(named("type") = ctypes_t<float
+#ifdef KFR_NATIVE_F64
+ ,
+ double
+#endif
+ >{},
+ [](auto type) {
+ using T = type_of<decltype(type)>;
- CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> double {
- double result = 0.0;
- for (size_t i = 0; i < taps.size(); i++)
- result += data.get(index - i, 0.0) * taps[i];
- return result;
- });
+ const univector<T, 100> data =
+ counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5);
+ const univector<T, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 };
+
+ CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> T {
+ T result = 0;
+ for (size_t i = 0; i < taps.size(); i++)
+ result += data.get(index - i, 0) * taps[i];
+ return result;
+ });
+
+ CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T {
+ T result = 0;
+ for (size_t i = 0; i < taps.size(); i++)
+ result += data.get(index - i, 0) * taps[i];
+ return result;
+ });
+ });
+#endif
}
#ifdef KFR_NATIVE_F64
TEST(fir_different)
{
const univector<float, 100> data = counter() + sequence(1, 2, -10, 100) + sequence(0, -7, 0.5f);
- const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 };
+ // const univector<double, 6> taps{ 1, 2, -2, 0.5, 0.0625, 4 };
+ const univector<double, 4> taps{ 1, 2, 3, 4 };
CHECK_EXPRESSION(fir(data, taps), 100, [&](size_t index) -> float {
double result = 0.0;
@@ -375,6 +441,114 @@ TEST(fir_complex)
});
}
+template <typename E, typename T, size_t size>
+void test_ir(E&& e, const univector<T, size>& test_vector)
+{
+ substitute(e, to_pointer(unitimpulse<T>()));
+ const univector<T, size> ir = e;
+ println(absmaxof(ir - test_vector));
+}
+
+template <typename T, typename... Ts, univector_tag Tag>
+inline const univector<T, Tag>& choose_array(const univector<T, Tag>& array, const univector<Ts, Tag>&...)
+{
+ return array;
+}
+
+template <typename T, typename T2, typename... Ts, univector_tag Tag, KFR_ENABLE_IF(!is_same<T, T2>::value)>
+inline const univector<T, Tag>& choose_array(const univector<T2, Tag>&, const univector<Ts, Tag>&... arrays)
+{
+ return choose_array<T>(arrays...);
+}
+
+TEST(biquad_lowpass1)
+{
+ testo::matrix(named("type") = ctypes_t<float, double>{}, [](auto type) {
+ using T = type_of<decltype(type)>;
+
+ const biquad_params<T> bq = biquad_lowpass<T>(0.1, 0.7);
+
+ constexpr size_t size = 32;
+
+ const univector<float, size> test_vector_f32{
+ +0x8.9bce2p-7, +0xd.8383ep-6, +0x8.f908dp-5, +0xe.edc21p-6, +0x9.ae104p-6, +0x9.dcc24p-7,
+ +0xd.50584p-9, -0xf.2668p-13, -0xd.09ca1p-10, -0xe.15995p-10, -0xa.b90d2p-10, -0xc.edea4p-11,
+ -0xb.f14eap-12, -0xc.2cb44p-14, +0xb.4a4dep-15, +0xb.685dap-14, +0xa.b181fp-14, +0xf.0cb2bp-15,
+ +0x8.695d6p-15, +0xd.bedd4p-17, +0xf.5474p-20, -0xd.bb266p-19, -0x9.63ca1p-18, -0xf.ca567p-19,
+ -0xa.5231p-19, -0xa.9e934p-20, -0xe.ab52p-22, +0xa.3c4cp-26, +0xd.721ffp-23, +0xe.ccc1ap-23,
+ +0xb.5f248p-23, +0xd.d2c9ap-24,
+ };
+
+ const univector<double, size> test_vector_f64{
+ +0x8.9bce2bf3663e8p-7, +0xd.8384010fdf1dp-6, +0x8.f908e7a36df6p-5, +0xe.edc2332a6d0bp-6,
+ +0x9.ae104af1da9ap-6, +0x9.dcc235ef68e7p-7, +0xd.5057ee425e05p-9, -0xf.266e42a99aep-13,
+ -0xd.09cad73642208p-10, -0xe.1599f32a83dp-10, -0xa.b90d8910a117p-10, -0xc.edeaabb890948p-11,
+ -0xb.f14edbb55383p-12, -0xc.2cb39b86f2dap-14, +0xb.4a506ecff055p-15, +0xb.685edfdb55358p-14,
+ +0xa.b182e32f8e298p-14, +0xf.0cb3dfd894b2p-15, +0x8.695df725b4438p-15, +0xd.beddc3606b9p-17,
+ +0xf.547004d20874p-20, -0xd.bb29b25b49b6p-19, -0x9.63cb9187da1dp-18, -0xf.ca588634fc618p-19,
+ -0xa.52322d320da78p-19, -0xa.9e9420154e4p-20, -0xe.ab51f7b0335ap-22, +0xa.3c6479980e1p-26,
+ +0xd.7223836599fp-23, +0xe.ccc47ddd18678p-23, +0xb.5f265b1be1728p-23, +0xd.d2cb83f8483f8p-24,
+ };
+
+ const univector<T, size> ir = biquad(bq, unitimpulse<T>());
+
+ CHECK(absmaxof(choose_array<T>(test_vector_f32, test_vector_f64) - ir) == 0);
+ });
+}
+
+TEST(biquad_lowpass2)
+{
+ testo::matrix(named("type") = ctypes_t<float, double>{}, [](auto type) {
+ using T = type_of<decltype(type)>;
+
+ const biquad_params<T> bq = biquad_lowpass<T>(0.45, 0.2);
+
+ constexpr size_t size = 32;
+
+ const univector<float, size> test_vector_f32{
+ +0x8.ce416p-4, +0x8.2979p-4, -0x8.a9d04p-7, +0xe.aeb3p-11, +0x8.204f8p-13, -0x8.20d78p-12,
+ +0x8.3379p-12, -0xf.83d81p-13, +0xe.8b5c4p-13, -0xd.9ddadp-13, +0xc.bedfcp-13, -0xb.ee123p-13,
+ +0xb.2a9e5p-13, -0xa.73ac4p-13, +0x9.c86f6p-13, -0x9.2828p-13, +0x8.92229p-13, -0x8.05b7p-13,
+ +0xf.048ffp-14, -0xe.0e849p-14, +0xd.28384p-14, -0xc.50a9p-14, +0xb.86e56p-14, -0xa.ca0b6p-14,
+ +0xa.19476p-14, -0x9.73d38p-14, +0x8.d8f64p-14, -0x8.48024p-14, +0xf.80aa2p-15, -0xe.82ad8p-15,
+ +0xd.94f22p-15, -0xc.b66d9p-15,
+ };
+
+ const univector<double, size> test_vector_f64{
+ +0x8.ce416c0d31e88p-4, +0x8.2978efe51dafp-4, -0x8.a9d088b81da6p-7, +0xe.aeb56c029358p-11,
+ +0x8.20492639873ap-13, -0x8.20d4e21aab538p-12, +0x8.3376b2d53b4a8p-12, -0xf.83d3d1c17343p-13,
+ +0xe.8b584f0dd5ac8p-13, -0xd.9dd740ceaacf8p-13, +0xc.bedc85e7a621p-13, -0xb.ee0f472bf8968p-13,
+ +0xb.2a9baed1fe6cp-13, -0xa.73a9d1670f4ep-13, +0x9.c86d29d297798p-13, -0x9.2825f4d894088p-13,
+ +0x8.9220a956d651p-13, -0x8.05b539fdd79e8p-13, +0xf.048cb5194cfa8p-14, -0xe.0e819fa128938p-14,
+ +0xd.2835957d684cp-14, -0xc.50a69c2a8dc18p-14, +0xb.86e33bbaf3cbp-14, -0xa.ca097058af2cp-14,
+ +0xa.1945ad1703dcp-14, -0x9.73d1eef7d8b68p-14, +0x8.d8f4df1bb3efp-14, -0x8.48010323c6f7p-14,
+ +0xf.80a7f5baeeb2p-15, -0xe.82ab94bb68a8p-15, +0xd.94f05f80af008p-15, -0xc.b66c0799b21a8p-15,
+ };
+
+ const univector<T, size> ir = biquad(bq, unitimpulse<T>());
+
+ CHECK(absmaxof(choose_array<T>(test_vector_f32, test_vector_f64) - ir) == 0);
+ });
+}
+
+TEST(resampler_test)
+{
+ const int in_sr = 44100;
+ const int out_sr = 48000;
+ const int freq = 100;
+ auto resampler = sample_rate_converter<fbase>(resample_quality::draft, out_sr, in_sr);
+ double delay = resampler.get_fractional_delay();
+ univector<fbase> out(out_sr / 10);
+ univector<fbase> in = truncate(sin(c_pi<fbase> * phasor<fbase>(freq, in_sr, 0)), in_sr / 10);
+ univector<fbase> ref = truncate(
+ sin(c_pi<fbase> * phasor<fbase>(freq, out_sr, -delay * (static_cast<double>(freq) / out_sr))),
+ out_sr / 10);
+ resampler.process(out, in);
+
+ CHECK(rms(slice(out - ref, static_cast<size_t>(ceil(delay * 2)))) < 0.005f);
+}
+} // namespace CMT_ARCH_NAME
+
#ifndef KFR_NO_MAIN
int main()
{
diff --git a/tests/ebu_test.cpp b/tests/ebu_test.cpp
@@ -1,122 +0,0 @@
-/**
- * KFR (http://kfrlib.com)
- * Copyright (C) 2016 D Levin
- * See LICENSE.txt for details
- */
-
-#include <kfr/testo/testo.hpp>
-
-#include <kfr/base.hpp>
-#include <kfr/dsp.hpp>
-#include <kfr/io.hpp>
-
-using namespace kfr;
-
-int main(int argc, char** argv)
-{
- if (argc < 3)
- {
- println("Usage: ebu_test INPUT_IN_F32_RAW_FORMAT CHANNEL_NUMBER");
- return 1;
- }
-
- // Prepare
- FILE* f = fopen(argv[1], "rb");
- const int channel_number = atoi(argv[2]);
- if (channel_number < 1 || channel_number > 6)
- {
- println("Incorrect number of channels");
- return 1;
- }
- fseek(f, 0, SEEK_END);
- uintmax_t size = ftell(f);
- fseek(f, 0, SEEK_SET);
- if (size % (sizeof(float) * channel_number))
- {
- println("Incorrect file size");
- return 1;
- }
-
- // Read file
- const size_t length = size / (sizeof(float) * channel_number);
- univector<float> interleaved(size / sizeof(float));
- size_t read_len = fread(interleaved.data(), 1, size, f);
- if (read_len != size)
- {
- println("Can't read file");
- return 1;
- }
-
- // Deinterleave
- univector<univector<float>> data(channel_number, univector<float>(length));
- for (size_t ch = 0; ch < channel_number; ++ch)
- {
- for (size_t i = 0; i < length; ++i)
- {
- data[ch][i] = interleaved[i * channel_number + ch];
- }
- }
-
- std::vector<Speaker> speakers;
- switch (channel_number)
- {
- case 1:
- speakers = { Speaker::Mono };
- break;
- case 2:
- speakers = { Speaker::Left, Speaker::Right };
- break;
- case 3:
- speakers = { Speaker::Left, Speaker::Right, Speaker::Center };
- break;
- case 4:
- speakers = { Speaker::Left, Speaker::Right, Speaker::LeftSurround, Speaker::RightSurround };
- break;
- case 5:
- speakers = { Speaker::Left, Speaker::Right, Speaker::Center, Speaker::LeftSurround,
- Speaker::RightSurround };
- break;
- case 6:
- speakers = { Speaker::Left, Speaker::Right, Speaker::Center,
- Speaker::LeftSurround, Speaker::RightSurround, Speaker::Lfe };
- break;
- }
-
- ebu_r128<float> loudness(48000, speakers);
-
- float M, S, I, RL, RH;
- float maxM = -HUGE_VALF, maxS = -HUGE_VALF;
- for (size_t i = 0; i < length / loudness.packet_size(); i++)
- {
- std::vector<univector_ref<float>> channels;
- for (size_t ch = 0; ch < channel_number; ++ch)
- {
- channels.push_back(data[ch].slice(i * loudness.packet_size(), loudness.packet_size()));
- }
- loudness.process_packet(channels);
- loudness.get_values(M, S, I, RL, RH);
- maxM = std::max(maxM, M);
- maxS = std::max(maxS, S);
- }
-
- {
- // For file-based measurements, the signal should be followed by at least 1.5 s of silence
- std::vector<univector_dyn<float>> channels(channel_number,
- univector_dyn<float>(loudness.packet_size()));
- for (size_t i = 0; i < 15; ++i)
- loudness.process_packet(channels);
- float dummyM, dummyS, dummyI;
- loudness.get_values(dummyM, dummyS, dummyI, RL, RH);
- }
-
- println(argv[1]);
- println("M = ", M);
- println("S = ", S);
- println("I = ", I);
- println("LRA = ", RH - RL);
- println("maxM = ", maxM);
- println("maxS = ", maxS);
- println();
-
- return 0;
-}
diff --git a/tests/empty_test.cpp b/tests/empty_test.cpp
@@ -1,5 +0,0 @@
-#include <kfr/all.hpp>
-
-using namespace kfr;
-
-int main() {}
diff --git a/tests/expression_test.cpp b/tests/expression_test.cpp
@@ -13,6 +13,9 @@
using namespace kfr;
+namespace CMT_ARCH_NAME
+{
+
TEST(pack)
{
const univector<float, 21> v1 = 1 + counter();
@@ -59,6 +62,17 @@ TEST(test_arg_access)
CHECK_EXPRESSION(e1, 10, [](size_t i) { return (i == 0 ? 100 : i) + 1; });
}
+TEST(to_pointer)
+{
+ auto e1 = to_pointer(counter<float>());
+
+ CHECK_EXPRESSION(e1, infinite_size, [](size_t i) { return static_cast<float>(i); });
+
+ auto e2 = to_pointer(gen_linear(0.f, 1.f));
+
+ CHECK_EXPRESSION(e2, infinite_size, [](size_t i) { return static_cast<float>(i); });
+}
+
TEST(test_arg_replace)
{
univector<float, 10> v1 = counter();
@@ -88,11 +102,11 @@ TEST(placeholders_pointer)
TEST(univector_assignment)
{
univector<int> x = truncate(counter(), 10);
- CHECK(x.size() == 10);
+ CHECK(x.size() == 10u);
univector<int> y;
y = truncate(counter(), 10);
- CHECK(y.size() == 10);
+ CHECK(y.size() == 10u);
}
TEST(size_calc)
@@ -102,9 +116,9 @@ TEST(size_calc)
auto b = slice(counter(), 100);
CHECK(b.size() == infinite_size);
auto c = slice(counter(), 100, 1000);
- CHECK(c.size() == 1000);
+ CHECK(c.size() == 1000u);
auto d = slice(c, 100);
- CHECK(d.size() == 900);
+ CHECK(d.size() == 900u);
}
TEST(reverse)
@@ -126,8 +140,8 @@ TEST(partition)
{
univector<double, 385> output = zeros();
auto result = partition(output, counter(), 5, 1);
- CHECK(result.count == 5);
- CHECK(result.chunk_size == 80);
+ CHECK(result.count == 5u);
+ CHECK(result.chunk_size == 80u);
result(0);
CHECK(sum(output) >= fast_range_sum(80 - 1));
@@ -144,8 +158,8 @@ TEST(partition)
{
univector<double, 385> output = zeros();
auto result = partition(output, counter(), 5, 160);
- CHECK(result.count == 3);
- CHECK(result.chunk_size == 160);
+ CHECK(result.count == 3u);
+ CHECK(result.chunk_size == 160u);
result(0);
CHECK(sum(output) >= fast_range_sum(160 - 1));
@@ -155,6 +169,7 @@ TEST(partition)
CHECK(sum(output) == fast_range_sum(385 - 1));
}
}
+} // namespace CMT_ARCH_NAME
#ifndef KFR_NO_MAIN
int main()
diff --git a/tests/generate_data.cpp b/tests/generate_data.cpp
@@ -0,0 +1,114 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+#define _USE_MATH_DEFINES
+
+#include "mpfr/mpfrplus.hpp"
+#include <kfr/cometa.hpp>
+#include <kfr/cometa/ctti.hpp>
+#include <kfr/cometa/function.hpp>
+#include <kfr/io/file.hpp>
+#include <random>
+
+constexpr size_t points = 10000;
+constexpr size_t points_2arg = 100;
+
+constexpr size_t fuzz_points = 10000;
+constexpr size_t fuzz_points_2arg = 100;
+
+using namespace kfr;
+
+using testo::test_data_entry;
+
+template <typename T>
+struct range_sampler
+{
+ double min;
+ double max;
+ T operator()(size_t i, size_t num) { return static_cast<T>(min + (max - min) * i / (points - 1)); }
+};
+
+template <typename T>
+struct fuzz_sampler
+{
+ std::mt19937_64 rnd{ 12345 };
+ T operator()(size_t i, size_t num) { return bitcast_anything<T>(static_cast<utype<T>>(rnd())); }
+};
+
+template <typename T, typename Sampler>
+void generate_table(const std::shared_ptr<file_writer<test_data_entry<T, 1>>>& writer,
+ cometa::function<mpfr::number(const mpfr::number&)> func, Sampler&& sampler)
+{
+ for (size_t i = 0; i < points; i++)
+ {
+ test_data_entry<T, 1> entry;
+ entry.arguments[0] = sampler(i, points);
+ entry.result = static_cast<T>(func(entry.arguments[0]));
+ writer->write(entry);
+ }
+}
+
+template <typename T, typename Sampler>
+void generate_table(const std::shared_ptr<file_writer<test_data_entry<T, 2>>>& writer,
+ cometa::function<mpfr::number(const mpfr::number&, const mpfr::number&)> func,
+ Sampler&& sampler)
+{
+ for (size_t i = 0; i < points_2arg; i++)
+ {
+ for (size_t j = 0; j < points_2arg; j++)
+ {
+ test_data_entry<T, 2> entry;
+ entry.arguments[0] = sampler(i, points_2arg);
+ entry.arguments[1] = sampler(j, points_2arg);
+ entry.result = static_cast<T>(func(entry.arguments[0], entry.arguments[1]));
+ writer->write(entry);
+ }
+ }
+}
+
+template <int args, typename Func>
+void generate_test(cint_t<args>, const char* name, const Func& func, double min, double max)
+{
+ generate_table(open_file_for_writing<test_data_entry<float, args>>(as_string(name, "_float_narrow")),
+ func, range_sampler<float>{ min, max });
+ generate_table(open_file_for_writing<test_data_entry<double, args>>(as_string(name, "_double_narrow")),
+ func, range_sampler<double>{ min, max });
+
+ generate_table(open_file_for_writing<test_data_entry<float, args>>(as_string(name, "_float_fuzz")), func,
+ fuzz_sampler<float>{});
+ generate_table(open_file_for_writing<test_data_entry<double, args>>(as_string(name, "_double_fuzz")),
+ func, fuzz_sampler<double>{});
+}
+
+int main()
+{
+ using num = mpfr::number;
+ mpfr::scoped_precision prec(512);
+ generate_test(cint<1>, "sin", [](const num& x) { return mpfr::sin(x); }, 0, M_PI * 2);
+ generate_test(cint<1>, "cos", [](const num& x) { return mpfr::cos(x); }, 0, M_PI * 2);
+ generate_test(cint<1>, "tan", [](const num& x) { return mpfr::tan(x); }, 0, M_PI);
+
+ generate_test(cint<1>, "asin", [](const num& x) { return mpfr::asin(x); }, 0, 1);
+ generate_test(cint<1>, "acos", [](const num& x) { return mpfr::acos(x); }, 0, 1);
+ generate_test(cint<1>, "atan", [](const num& x) { return mpfr::atan(x); }, 0, 1);
+ generate_test(cint<2>, "atan2", [](const num& x, const num& y) { return mpfr::atan2(x, y); }, 0, 10);
+
+ generate_test(cint<1>, "sinh", [](const num& x) { return mpfr::sinh(x); }, 0, 10 * 2);
+ generate_test(cint<1>, "cosh", [](const num& x) { return mpfr::cosh(x); }, 0, 10 * 2);
+ generate_test(cint<1>, "tanh", [](const num& x) { return mpfr::tanh(x); }, 0, 10 * 2);
+ generate_test(cint<1>, "coth", [](const num& x) { return mpfr::coth(x); }, 0, 10 * 2);
+
+ generate_test(cint<1>, "gamma", [](const num& x) { return mpfr::gamma(x); }, 0, 10);
+
+ generate_test(cint<1>, "log", [](const num& x) { return mpfr::log(x); }, 0, 100);
+ generate_test(cint<1>, "log2", [](const num& x) { return mpfr::log2(x); }, 0, 100);
+ generate_test(cint<1>, "log10", [](const num& x) { return mpfr::log10(x); }, 0, 100);
+
+ generate_test(cint<1>, "exp", [](const num& x) { return mpfr::exp(x); }, -10, 10);
+ generate_test(cint<1>, "exp2", [](const num& x) { return mpfr::exp2(x); }, -10, 10);
+ generate_test(cint<1>, "exp10", [](const num& x) { return mpfr::exp10(x); }, -10, 10);
+
+ generate_test(cint<1>, "cbrt", [](const num& x) { return mpfr::cbrt(x); }, 0, 1000);
+}
diff --git a/tests/intrinsic_test.cpp b/tests/intrinsic_test.cpp
@@ -7,44 +7,12 @@
#include <kfr/testo/testo.hpp>
#include <kfr/base.hpp>
-#include <kfr/dsp.hpp>
#include <kfr/io.hpp>
using namespace kfr;
-constexpr ctypes_t<i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64, i8x3, //
- i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x3, //
- i32x1, i32x2, i32x4, i32x8, i32x16, i32x3 //
-#ifdef KFR_NATIVE_I64
- ,
- i64x1, i64x2, i64x4, i64x8, i64x16, i64x3 //
-#endif
- >
- signed_types{};
-
-constexpr ctypes_t<u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64, u8x3, //
- u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x3, //
- u32x1, u32x2, u32x4, u32x8, u32x16, u32x3 //
-#ifdef KFR_NATIVE_I64
- ,
- u64x1, u64x2, u64x4, u64x8, u64x16, u64x3 //
-#endif
- >
- unsigned_types{};
-
-constexpr ctypes_t<f32x1, f32x2, f32x4, f32x8, f32x16, f32x3 //
-#ifdef KFR_NATIVE_F64
- ,
- f64x1, f64x2, f64x4, f64x8, f64x16, f64x3 //
-#endif
- >
- float_types{};
-
-template <typename T>
-inline T ref_abs(T x)
+namespace CMT_ARCH_NAME
{
- return x >= T(0) ? x : -x;
-}
template <typename T>
bool builtin_add_overflow(T x, T y, T* r)
@@ -127,43 +95,6 @@ inline T ref_satsub(T x, T y)
return result;
}
-TEST(intrin_select)
-{
- testo::matrix(named("type") = cconcat(signed_types, cconcat(unsigned_types, float_types)), [](auto type) {
- using Tvec = type_of<decltype(type)>;
- using T = subtype<Tvec>;
- CHECK(kfr::select(make_mask<T>(false), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(2));
- CHECK(kfr::select(make_mask<T>(true), make_vector<T>(1), make_vector<T>(2)) == make_vector<T>(1));
- });
-}
-
-TEST(intrin_abs)
-{
- testo::assert_is_same<decltype(kfr::abs(1)), int>();
- testo::assert_is_same<decltype(kfr::abs(1u)), unsigned int>();
- testo::assert_is_same<decltype(kfr::abs(make_vector(1))), i32x1>();
- testo::assert_is_same<decltype(kfr::abs(make_vector(1, 2))), i32x2>();
- CHECK(kfr::abs(9u) == 9u);
- CHECK(kfr::abs(9) == 9);
- CHECK(kfr::abs(-9) == 9);
- CHECK(kfr::abs(-infinity) == infinity);
- CHECK(kfr::abs(make_vector(9)) == make_vector(9));
- CHECK(kfr::abs(make_vector(-9)) == make_vector(9));
-
- testo::matrix(named("type") = signed_types, named("value") = std::vector<int>{ -1, 0, +1 },
- [](auto type, int value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x));
- });
- testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ -1, 0, +1 },
- [](auto type, int value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(kfr::abs(x) == apply([](auto x) { return ref_abs(x); }, x));
- });
-}
-
TEST(intrin_sqrt)
{
testo::assert_is_same<decltype(kfr::sqrt(9)), fbase>();
@@ -175,141 +106,45 @@ TEST(intrin_sqrt)
CHECK(kfr::sqrt(-9) == fbase(qnan));
CHECK(kfr::sqrt(make_vector(9)) == make_vector<fbase>(3.0));
CHECK(kfr::sqrt(make_vector(-9)) == make_vector<fbase>(qnan));
- testo::matrix(named("type") = float_types, named("value") = std::vector<int>{ 0, 2, 65536 },
+ testo::matrix(named("type") = float_vector_types<vec>, named("value") = std::vector<int>{ 0, 2, 65536 },
[](auto type, int value) {
using T = type_of<decltype(type)>;
const T x(value);
- CHECK(kfr::sqrt(x) == apply([](auto x) { return std::sqrt(x); }, x));
- });
-}
-
-TEST(intrin_round)
-{
- testo::assert_is_same<decltype(kfr::floor(100)), int>();
- testo::assert_is_same<decltype(kfr::ceil(100)), int>();
- testo::assert_is_same<decltype(kfr::round(100)), int>();
- testo::assert_is_same<decltype(kfr::trunc(100)), int>();
- testo::assert_is_same<decltype(kfr::fract(100)), int>();
-
- testo::assert_is_same<decltype(kfr::ifloor(100.f)), int>();
- testo::assert_is_same<decltype(kfr::iceil(100.f)), int>();
- testo::assert_is_same<decltype(kfr::iround(100.f)), int>();
- testo::assert_is_same<decltype(kfr::itrunc(100.f)), int>();
- CHECK(kfr::floor(100) == 100);
- CHECK(kfr::ceil(100) == 100);
- CHECK(kfr::round(100) == 100);
- CHECK(kfr::trunc(100) == 100);
- CHECK(kfr::fract(100) == 0);
-
- testo::matrix(named("type") = float_types,
- named("value") = std::vector<fbase>{ -1.51, -1.49, 0.0, +1.49, +1.51 },
- [](auto type, fbase value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(kfr::floor(x) == apply([](auto x) { return std::floor(x); }, x));
- CHECK(kfr::ceil(x) == apply([](auto x) { return std::ceil(x); }, x));
- CHECK(kfr::round(x) == apply([](auto x) { return std::round(x); }, x));
- CHECK(kfr::trunc(x) == apply([](auto x) { return std::trunc(x); }, x));
- CHECK(kfr::fract(x) == apply([](auto x) { return x - std::floor(x); }, x));
- });
-}
-
-TEST(intrin_min_max)
-{
- testo::assert_is_same<decltype(min(1, 2)), int>();
- testo::assert_is_same<decltype(min(1, 2u)), unsigned int>();
- testo::assert_is_same<decltype(min(1, 2)), int>();
- testo::assert_is_same<decltype(min(pack(1), 2u)), u32x1>();
- testo::assert_is_same<decltype(min(2u, pack(1))), u32x1>();
- testo::assert_is_same<decltype(min(pack(1), pack(2u))), u32x1>();
- testo::assert_is_same<decltype(min(pack(1, 2, 3), pack(1.0, 2.0, 3.0))), f64x3>();
- testo::assert_is_same<decltype(min(pack(1.0, 2.0, 3.0), pack(1, 2, 3))), f64x3>();
-
- CHECK(min(1, 2) == 1);
- CHECK(min(1, 2u) == 1u);
- CHECK(min(pack(1), 2) == pack(1));
- CHECK(min(pack(1, 2, 3), 2) == pack(1, 2, 2));
- CHECK(min(pack(1., 2., 3.), 2) == pack(1., 2., 2.));
-
- testo::matrix(named("type") = float_types,
- named("value") = std::vector<std::pair<fbase, fbase>>{ { -100, +100 }, { infinity, 0.0 } },
- [](auto type, std::pair<fbase, fbase> value) {
- using T = type_of<decltype(type)>;
- const T x(value.first);
- const T y(value.second);
- CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y));
- CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y));
- CHECK(kfr::absmin(x, y) ==
- apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y));
- CHECK(kfr::absmax(x, y) ==
- apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y));
- });
- testo::matrix(named("type") = signed_types,
- named("value") = std::vector<std::pair<int, int>>{ { -100, +100 } },
- [](auto type, std::pair<int, int> value) {
- using T = type_of<decltype(type)>;
- const T x(value.first);
- const T y(value.second);
- CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y));
- CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y));
- CHECK(kfr::absmin(x, y) ==
- apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y));
- CHECK(kfr::absmax(x, y) ==
- apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y));
- });
- testo::matrix(named("type") = unsigned_types,
- named("value") = std::vector<std::pair<unsigned, unsigned>>{ { 0, +200 } },
- [](auto type, std::pair<unsigned, unsigned> value) {
- using T = type_of<decltype(type)>;
- const T x(value.first);
- const T y(value.second);
- CHECK(kfr::min(x, y) == apply([](auto x, auto y) { return std::min(x, y); }, x, y));
- CHECK(kfr::max(x, y) == apply([](auto x, auto y) { return std::max(x, y); }, x, y));
- CHECK(kfr::absmin(x, y) ==
- apply([](auto x, auto y) { return std::min(ref_abs(x), ref_abs(y)); }, x, y));
- CHECK(kfr::absmax(x, y) ==
- apply([](auto x, auto y) { return std::max(ref_abs(x), ref_abs(y)); }, x, y));
+ CHECK(kfr::sqrt(x) == apply([](auto x) -> decltype(x) { return std::sqrt(x); }, x));
});
}
TEST(intrin_satadd_satsub)
{
- testo::matrix(named("type") = signed_types, [](auto type) {
- using T = type_of<decltype(type)>;
- using Tsub = subtype<T>;
- const T min = std::numeric_limits<Tsub>::min();
- const T max = std::numeric_limits<Tsub>::max();
- CHECK(kfr::satadd(min, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, min));
- CHECK(kfr::satadd(max, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, max));
- CHECK(kfr::satadd(min, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, max));
- CHECK(kfr::satadd(max, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, min));
-
- CHECK(kfr::satsub(min, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, min));
- CHECK(kfr::satsub(max, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, max));
- CHECK(kfr::satsub(min, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, max));
- CHECK(kfr::satsub(max, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, min));
- });
-
- testo::matrix(named("type") = unsigned_types, [](auto type) {
- using T = type_of<decltype(type)>;
- using Tsub = subtype<T>;
- const T& min = std::numeric_limits<Tsub>::min();
- const T& max = std::numeric_limits<Tsub>::max();
- CHECK(kfr::satadd(min, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, min));
- CHECK(kfr::satadd(max, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, max));
- CHECK(kfr::satadd(min, max) == apply([](auto x, auto y) { return ref_satadd(x, y); }, min, max));
- CHECK(kfr::satadd(max, min) == apply([](auto x, auto y) { return ref_satadd(x, y); }, max, min));
-
- CHECK(kfr::satsub(min, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, min));
- CHECK(kfr::satsub(max, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, max));
- CHECK(kfr::satsub(min, max) == apply([](auto x, auto y) { return ref_satsub(x, y); }, min, max));
- CHECK(kfr::satsub(max, min) == apply([](auto x, auto y) { return ref_satsub(x, y); }, max, min));
- });
+ testo::matrix(named("type") = cconcat(signed_vector_types<vec>, unsigned_vector_types<vec>),
+ [](auto type) {
+ using T = type_of<decltype(type)>;
+ using Tsub = subtype<T>;
+ const T min = std::numeric_limits<Tsub>::min();
+ const T max = std::numeric_limits<Tsub>::max();
+ CHECK(kfr::satadd(min, min) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, min, min));
+ CHECK(kfr::satadd(max, max) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, max, max));
+ CHECK(kfr::satadd(min, max) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, min, max));
+ CHECK(kfr::satadd(max, min) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satadd(x, y); }, max, min));
+
+ CHECK(kfr::satsub(min, min) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, min, min));
+ CHECK(kfr::satsub(max, max) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, max, max));
+ CHECK(kfr::satsub(min, max) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, min, max));
+ CHECK(kfr::satsub(max, min) ==
+ apply([](auto x, auto y) -> decltype(x) { return ref_satsub(x, y); }, max, min));
+ });
}
TEST(intrin_any_all)
{
- testo::matrix(named("type") = unsigned_types, [](auto type) {
+ testo::matrix(named("type") = unsigned_vector_types<vec>, [](auto type) {
using T = type_of<decltype(type)>;
constexpr size_t width = widthof<T>();
using Tsub = subtype<T>;
@@ -328,74 +163,7 @@ TEST(intrin_any_all)
});
}
-TEST(intrin_math)
-{
- testo::assert_is_same<decltype(pack(11) * pack(0.5)), f64x1>();
- testo::assert_is_same<decltype(pack(11) * 0.5), f64x1>();
- testo::assert_is_same<decltype(kfr::sin(2)), fbase>();
- testo::assert_is_same<decltype(kfr::sin(pack(2))), vec<fbase, 1>>();
- testo::assert_is_same<decltype(kfr::sindeg(2)), fbase>();
- testo::assert_is_same<decltype(kfr::sindeg(pack(2))), vec<fbase, 1>>();
-
- CHECK(pack(11) * pack(0.5) == 5.5);
- CHECK(pack(11) * 0.5 == 5.5);
- CHECK(kfr::sin(2) == fbase(0.90929742682568169539601986591174));
- CHECK(kfr::sin(pack(2)) == pack(fbase(0.90929742682568169539601986591174)));
- CHECK(kfr::sindeg(2) == fbase(0.03489949670250097164599518162533));
- CHECK(kfr::sindeg(pack(2)) == pack(fbase(0.03489949670250097164599518162533)));
- CHECK(kfr::cos(2) == fbase(-0.41614683654714238699756822950076));
- CHECK(kfr::cos(pack(2)) == pack(fbase(-0.41614683654714238699756822950076)));
- CHECK(kfr::cosdeg(2) == fbase(0.99939082701909573000624344004393));
- CHECK(kfr::cosdeg(pack(2)) == pack(fbase(0.99939082701909573000624344004393)));
-
- CHECK(kfr::log(2) == fbase(0.6931471805599453));
- CHECK(kfr::log(pack(2)) == pack(fbase(0.6931471805599453)));
- CHECK(kfr::log2(2) == fbase(1.0));
- CHECK(kfr::log2(pack(2)) == pack(fbase(1.0)));
- CHECK(kfr::log10(2) == fbase(0.30102999566398119521373889472449));
- CHECK(kfr::log10(pack(2)) == pack(fbase(0.30102999566398119521373889472449)));
-
- CHECK(kfr::exp(2) == fbase(7.3890560989306502));
- CHECK(kfr::exp(pack(2)) == pack(fbase(7.3890560989306502)));
- CHECK(kfr::exp2(2) == fbase(4.0));
- CHECK(kfr::exp2(pack(2)) == pack(fbase(4.0)));
-
- CHECK(kfr::logn(2, 10) == fbase(0.30102999566398119521373889472449));
- CHECK(kfr::logn(pack(2), pack(10)) == pack(fbase(0.30102999566398119521373889472449)));
-
- CHECK(kfr::pow(2, fbase(0.9)) == fbase(1.8660659830736148319626865322999));
- CHECK(kfr::pow(pack(2), pack(fbase(0.9))) == pack(fbase(1.8660659830736148319626865322999)));
-
- CHECK(kfr::root(fbase(1.5), 2) == fbase(1.2247448713915890490986420373529));
- CHECK(kfr::root(pack(fbase(1.5)), pack(2)) == pack(fbase(1.2247448713915890490986420373529)));
-
- testo::epsilon<float>() *= 10.0;
- testo::epsilon<double>() *= 10.0;
-
- CHECK(kfr::sinh(2) == fbase(3.6268604078470187676682139828013));
- CHECK(kfr::sinh(pack(2)) == pack(fbase(3.6268604078470187676682139828013)));
- CHECK(kfr::cosh(2) == fbase(3.7621956910836314595622134777737));
- CHECK(kfr::cosh(pack(2)) == pack(fbase(3.7621956910836314595622134777737)));
-
- CHECK(kfr::tanh(2) == fbase(0.96402758007581688394641372410092));
- CHECK(kfr::tanh(pack(2)) == pack(fbase(0.96402758007581688394641372410092)));
- CHECK(kfr::coth(2) == fbase(1.0373147207275480958778097647678));
- CHECK(kfr::coth(pack(2)) == pack(fbase(1.0373147207275480958778097647678)));
-
- testo::epsilon<float>() *= 10.0;
- testo::epsilon<double>() *= 10.0;
-
- CHECK(kfr::tan(2) == fbase(-2.1850398632615189916433061023137));
- CHECK(kfr::tan(pack(2)) == pack(fbase(-2.1850398632615189916433061023137)));
- CHECK(kfr::tandeg(2) == fbase(0.03492076949174773050040262577373));
- CHECK(kfr::tandeg(pack(2)) == pack(fbase(0.03492076949174773050040262577373)));
-
- testo::epsilon<float>() *= 10.0;
- testo::epsilon<double>() *= 10.0;
-
- CHECK(kfr::note_to_hertz(60) == fbase(261.6255653005986346778499935233));
- CHECK(kfr::note_to_hertz(pack(60)) == pack(fbase(261.6255653005986346778499935233)));
-}
+} // namespace CMT_ARCH_NAME
#ifndef KFR_NO_MAIN
int main()
diff --git a/tests/io_test.cpp b/tests/io_test.cpp
@@ -8,11 +8,13 @@
#include <kfr/base.hpp>
#include <kfr/cometa/function.hpp>
-#include <kfr/dsp.hpp>
#include <kfr/io.hpp>
using namespace kfr;
+namespace CMT_ARCH_NAME
+{
+
#if KFR_ENABLE_WAV
TEST(write_wav_file)
{
@@ -22,17 +24,17 @@ TEST(write_wav_file)
data = sin(counter() * 0.01f);
size_t wr = writer.write(data.data(), data.size());
CHECK(wr == data.size());
- CHECK(writer.format().length == data.size() / 2);
+ CHECK(umax(writer.format().length) == data.size() / 2);
}
TEST(read_wav_file)
{
audio_reader_wav<float> reader(open_file_for_reading(KFR_FILEPATH("temp_audio_file.wav")));
- CHECK(reader.format().channels == 2);
+ CHECK(reader.format().channels == 2u);
CHECK(reader.format().type == audio_sample_type::i16);
CHECK(reader.format().samplerate == 44100);
univector<float> data(44100 * 2);
- CHECK(reader.format().length == data.size() / 2);
+ CHECK(umax(reader.format().length) == data.size() / 2);
size_t rd = reader.read(data.data(), data.size());
CHECK(rd == data.size());
CHECK(absmaxof(data - render(sin(counter() * 0.01f), data.size())) < 0.0001f);
@@ -40,10 +42,10 @@ TEST(read_wav_file)
#endif
#if KFR_ENABLE_FLAC
-TEST(read_flac_file)
+DTEST(read_flac_file)
{
audio_reader_flac<float> reader(open_file_for_reading(KFR_FILEPATH("../../tests/test-audio/sine.flac")));
- CHECK(reader.format().channels == 2);
+ CHECK(reader.format().channels == 2u);
CHECK(reader.format().type == audio_sample_type::i32);
CHECK(reader.format().samplerate == 44100);
univector<float> data(44100 * 2);
@@ -53,6 +55,7 @@ TEST(read_flac_file)
CHECK(absmaxof(data - render(sin(counter() * 0.01f), data.size())) < 0.0001f);
}
#endif
+} // namespace CMT_ARCH_NAME
#ifndef KFR_NO_MAIN
int main()
diff --git a/tests/mpfr/mpfrplus.hpp b/tests/mpfr/mpfrplus.hpp
@@ -18,6 +18,7 @@ MPFR_DIAG_PRAGMA(ignored "-Wsign-conversion")
MPFR_DIAG_PRAGMA(pop)
#include <cmath>
#include <limits>
+#include <string>
#include <type_traits>
namespace mpfr
@@ -47,17 +48,14 @@ constexpr with_precision_t with_precision{};
namespace internal
{
-#ifndef MPFR_THREAD_LOCAL
-#define MPFR_THREAD_LOCAL thread_local
-#endif
-static mpfr_prec_t& precision()
+inline mpfr_prec_t& precision()
{
- static MPFR_THREAD_LOCAL mpfr_prec_t prec = mpfr_get_default_prec();
+ static mpfr_prec_t prec = mpfr_get_default_prec();
return prec;
}
-static mpfr_rnd_t& rounding_mode()
+inline mpfr_rnd_t& rounding_mode()
{
- static MPFR_THREAD_LOCAL mpfr_rnd_t rnd = mpfr_get_default_rounding_mode();
+ static mpfr_rnd_t rnd = mpfr_get_default_rounding_mode();
return rnd;
}
} // namespace internal
@@ -241,7 +239,7 @@ public:
MPFR_CXX_CTOR_T(mpfr_set_ui, unsigned int)
MPFR_CXX_CTOR_T(mpfr_set_si, long int)
MPFR_CXX_CTOR_T(mpfr_set_ui, unsigned long int)
-#if __INTMAX_MAX__ != __LONG_MAX__
+#ifdef _MPFR_H_HAVE_INTMAX_T
MPFR_CXX_CTOR_T(mpfr_set_sj, intmax_t)
MPFR_CXX_CTOR_T(mpfr_set_uj, uintmax_t)
#endif
@@ -253,7 +251,7 @@ public:
MPFR_CXX_ASGN_T(mpfr_set_ui, unsigned int)
MPFR_CXX_ASGN_T(mpfr_set_si, long int)
MPFR_CXX_ASGN_T(mpfr_set_ui, unsigned long int)
-#if __INTMAX_MAX__ != __LONG_MAX__
+#ifdef _MPFR_H_HAVE_INTMAX_T
MPFR_CXX_ASGN_T(mpfr_set_sj, intmax_t)
MPFR_CXX_ASGN_T(mpfr_set_uj, uintmax_t)
#endif
@@ -300,6 +298,15 @@ public:
{
return mpfr_get_ld(val, internal::rounding_mode());
}
+
+ std::string to_string() const
+ {
+ char* str;
+ mpfr_asprintf(&str, "%.*Rg", prec(), val);
+ std::string result = str;
+ mpfr_free_str(str);
+ return result;
+ }
};
#ifdef MPFR_USE_UDL
diff --git a/tests/multiarch.cpp b/tests/multiarch.cpp
@@ -7,7 +7,6 @@
#include <kfr/testo/testo.hpp>
#include <kfr/base.hpp>
-#include <kfr/cpuid.hpp>
#include <kfr/dsp.hpp>
#include <kfr/io.hpp>
diff --git a/tests/numeric_tests.hpp b/tests/numeric_tests.hpp
@@ -0,0 +1,123 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+
+#include <kfr/io.hpp>
+#include <kfr/testo/testo.hpp>
+
+namespace kfr
+{
+
+using testo::test_data_entry;
+
+inline namespace CMT_ARCH_NAME
+{
+
+using vector_types =
+ ctypes_t<f32, f32x1, f32x2, f32x4, f32x8, f32x16, f32x32, f64, f64x1, f64x2, f64x4, f64x8, f64x16>;
+
+template <typename T>
+uint64_t ulps(T x, T y)
+{
+ if (std::abs(x) < std::numeric_limits<T>::min() && std::abs(y) < std::numeric_limits<T>::min())
+ return 0;
+ if (std::isnan(x) && std::isnan(y))
+ return 0;
+ if (std::isinf(x) && std::isinf(y))
+ return (x < 0) == (y < 0) ? 0 : ULLONG_MAX;
+ if (x < 0 && y < 0)
+ return ulps<T>(-x, -y);
+ if ((x < 0) != (y < 0))
+ return ulps<T>(std::abs(x), 0) + ulps<T>(std::abs(y), 0);
+
+ utype<T> ix = cometa::bitcast_anything<utype<T>>(x);
+ utype<T> iy = cometa::bitcast_anything<utype<T>>(y);
+ if (std::abs(x) < std::numeric_limits<T>::min() && y > std::numeric_limits<T>::min())
+ return 1 + ulps<T>(std::numeric_limits<T>::min(), y);
+ if (std::abs(x) > std::numeric_limits<T>::min() && y < std::numeric_limits<T>::min())
+ return 1 + ulps<T>(x, std::numeric_limits<T>::min());
+ return ix > iy ? ix - iy : iy - ix;
+}
+
+template <typename T, size_t N>
+uint64_t ulps(vec<T, N> x, vec<T, N> y)
+{
+ uint64_t u = 0;
+ for (size_t i = 0; i < N; i++)
+ {
+ u = std::max(u, ulps(x[i], y[i]));
+ }
+ return u;
+}
+
+inline const char* tname(ctype_t<f32>) { return "float"; }
+inline const char* tname(ctype_t<f64>) { return "double"; }
+
+#define CHECK_DIFF(x_arg, y_arg, threshold) \
+ do \
+ { \
+ ++checks_count; \
+ const auto x_arg_value = x_arg; \
+ const auto y_arg_value = y_arg; \
+ const auto arg_diff = ulps(x_arg_value, y_arg_value); \
+ error_sum += arg_diff; \
+ error_peak = std::max(error_peak, arg_diff); \
+ ::testo::active_test()->check( \
+ arg_diff <= threshold, \
+ ::cometa::as_string(x_arg_value, " ~= ", y_arg_value, " (", arg_diff, " <= ", threshold, ")"), \
+ #x_arg " ~= " #y_arg); \
+ } while (0)
+
+#define KFR_AUTO_TEST_1(fn, datafile, maxulps, avgulps) \
+ TEST(fn##_##datafile) \
+ { \
+ testo::matrix(named("type") = vector_types(), [&](auto type) { \
+ using T = type_of<decltype(type)>; \
+ using Tsub = subtype<T>; \
+ double error_sum = 0.0; \
+ uint64_t error_peak = 0; \
+ uint64_t checks_count = 0; \
+ std::shared_ptr<file_reader<test_data_entry<Tsub, 1>>> reader = \
+ open_file_for_reading<test_data_entry<Tsub, 1>>( \
+ std::string(KFR_SRC_DIR "/tests/data/" #fn "_") + tname(ctype<Tsub>) + "_" #datafile); \
+ test_data_entry<Tsub, 1> entry; \
+ while (reader->read(entry)) \
+ { \
+ testo::scope s(as_string(entry.arguments[0])); \
+ CHECK_DIFF(kfr::fn(entry.arguments[0]), entry.result, maxulps); \
+ } \
+ CHECK(checks_count > 0u); \
+ CHECK(error_sum / checks_count <= avgulps); \
+ println("measured accuracy: ", tname(ctype<Tsub>), " ", error_sum / checks_count, "(peak ", \
+ error_peak, ")"); \
+ }); \
+ }
+
+#define KFR_AUTO_TEST_2(fn, datafile, maxulps, avgulps) \
+ TEST(fn##_##datafile) \
+ { \
+ testo::matrix(named("type") = vector_types(), [&](auto type) { \
+ using T = type_of<decltype(type)>; \
+ using Tsub = subtype<T>; \
+ double error_sum = 0.0; \
+ uint64_t error_peak = 0; \
+ uint64_t checks_count = 0; \
+ std::shared_ptr<file_reader<test_data_entry<Tsub, 2>>> reader = \
+ open_file_for_reading<test_data_entry<Tsub, 2>>( \
+ std::string(KFR_SRC_DIR "/tests/data/" #fn "_") + tname(ctype<Tsub>) + "_" #datafile); \
+ test_data_entry<Tsub, 2> entry; \
+ while (reader->read(entry)) \
+ { \
+ testo::scope s(as_string(entry.arguments[0], entry.arguments[1])); \
+ CHECK_DIFF(kfr::fn(entry.arguments[0], entry.arguments[1]), entry.result, maxulps); \
+ } \
+ CHECK(checks_count > 0u); \
+ CHECK(error_sum / checks_count <= avgulps); \
+ println("measured accuracy: ", tname(ctype<Tsub>), " ", error_sum / checks_count, "(peak ", \
+ error_peak, ")"); \
+ }); \
+ }
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/resampler_test.cpp b/tests/resampler_test.cpp
@@ -1,37 +0,0 @@
-/**
- * KFR (http://kfrlib.com)
- * Copyright (C) 2016 D Levin
- * See LICENSE.txt for details
- */
-
-#include <kfr/dsp.hpp>
-#include <kfr/io.hpp>
-#include <kfr/testo/testo.hpp>
-
-using namespace kfr;
-
-TEST(resampler_test)
-{
- const int in_sr = 44100;
- const int out_sr = 48000;
- const int freq = 100;
- auto resampler = sample_rate_converter<fbase>(resample_quality::draft, out_sr, in_sr);
- double delay = resampler.get_fractional_delay();
- univector<fbase> out(out_sr / 10);
- univector<fbase> in = truncate(sin(c_pi<fbase> * phasor<fbase>(freq, in_sr, 0)), in_sr / 10);
- univector<fbase> ref = truncate(
- sin(c_pi<fbase> * phasor<fbase>(freq, out_sr, -delay * (static_cast<double>(freq) / out_sr))),
- out_sr / 10);
- resampler.process(out, in);
-
- CHECK(rms(slice(out - ref, ceil(delay * 2))) < 0.005f);
-}
-
-#ifndef KFR_NO_MAIN
-int main()
-{
- println(library_version());
-
- return testo::run_all("", true);
-}
-#endif
diff --git a/tests/transcendental_test.cpp b/tests/transcendental_test.cpp
@@ -1,172 +0,0 @@
-/**
- * KFR (http://kfrlib.com)
- * Copyright (C) 2016 D Levin
- * See LICENSE.txt for details
- */
-
-#include <kfr/testo/testo.hpp>
-
-#include <kfr/base.hpp>
-#include <kfr/io.hpp>
-
-#define MPFR_THREAD_LOCAL
-#include "mpfr/mpfrplus.hpp"
-
-using namespace kfr;
-
-using vector_types = ctypes_t<f32, f64, f32x2, f32x8, f32x16, f64x2, f64x4, f64x8>;
-
-template <typename T>
-double ulps(T test, const mpfr::number& ref)
-{
- if (std::isnan(test) && ref.isnan())
- return 0;
- if (std::isinf(test) && ref.isinfinity())
- return (test < 0) == (ref < 0) ? 0 : NAN;
- return static_cast<double>(mpfr::abs(mpfr::number(test) - ref) /
- mpfr::abs(mpfr::number(test) - std::nexttoward(test, HUGE_VALL)));
-}
-
-template <typename T, size_t N>
-double ulps(const vec<T, N>& test, const mpfr::number& ref)
-{
- double u = 0;
- for (size_t i = 0; i < N; ++i)
- u = std::max(u, ulps(test[i], ref));
- return u;
-}
-
-TEST(test_sin_cos)
-{
- testo::matrix(named("type") = vector_types(),
- named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.05),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0);
- CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0);
- });
- testo::matrix(named("type") = vector_types(), named("value") = make_range(-100.0, 100.0, 0.5),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::sin(x), mpfr::sin(subtype<T>(value))) < 2.0);
- CHECK(ulps(kfr::cos(x), mpfr::cos(subtype<T>(value))) < 2.0);
- });
-}
-
-TEST(test_tan)
-{
- testo::matrix(named("type") = ctypes_t<f32>(),
- named("value") = make_range(0.0, +constants<f64>::pi * 2, 0.01),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 2.0);
- });
- testo::matrix(named("type") = ctypes_t<f32>(), named("value") = make_range(-100.0, 100.0, 0.5),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::tan(x), mpfr::tan(subtype<T>(value))) < 3.0);
- });
-}
-
-#ifdef __clang__
-#define ARCFN_ULP 2.0
-#else
-#define ARCFN_ULP 2.5
-#endif
-
-TEST(test_asin_acos_atan)
-{
- testo::matrix(named("type") = vector_types(), named("value") = make_range(-1.0, 1.0, 0.05),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::asin(x), mpfr::asin(subtype<T>(value))) < ARCFN_ULP);
- CHECK(ulps(kfr::acos(x), mpfr::acos(subtype<T>(value))) < ARCFN_ULP);
- CHECK(ulps(kfr::atan(x), mpfr::atan(subtype<T>(value))) < ARCFN_ULP);
- });
-}
-
-TEST(test_atan2)
-{
- testo::matrix(named("type") = vector_types(), named("value1") = make_range(-1.0, 1.0, 0.1),
- named("value2") = make_range(-1.0, 1.0, 0.1), [](auto type, double value1, double value2) {
- using T = type_of<decltype(type)>;
- const T x(value1);
- const T y(value2);
- CHECK(ulps(kfr::atan2(x, y), mpfr::atan2(subtype<T>(value1), subtype<T>(value2))) <
- ARCFN_ULP);
- });
-}
-
-TEST(test_log)
-{
- testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::log(x), mpfr::log(x)) < 2.0);
- });
-}
-
-TEST(test_log2)
-{
- testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::log2(x), mpfr::log2(x)) < 3.0);
- });
-}
-
-TEST(test_log10)
-{
- testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(0.0, 100.0, 0.5),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::log10(x), mpfr::log10(x)) < 3.0);
- });
-}
-
-TEST(test_exp)
-{
- testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::exp(x), mpfr::exp(x)) < 2.0);
- });
-}
-
-TEST(test_exp2)
-{
- testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::exp2(x), mpfr::exp2(x)) < 3.0);
- });
-}
-
-TEST(test_exp10)
-{
- testo::matrix(named("type") = ctypes_t<f32, f64>(), named("value") = make_range(-10, +10, 0.05),
- [](auto type, double value) {
- using T = type_of<decltype(type)>;
- const T x(value);
- CHECK(ulps(kfr::exp10(x), mpfr::exp10(x)) < 3.0);
- });
-}
-
-#ifndef KFR_NO_MAIN
-int main()
-{
- println(library_version(), " running on ", cpu_runtime());
- mpfr::scoped_precision p(128);
- return testo::run_all("");
-}
-#endif
diff --git a/tests/unit/base/conversion.cpp b/tests/unit/base/conversion.cpp
@@ -0,0 +1,67 @@
+#include <kfr/base/conversion.hpp>
+
+#include <kfr/base/basic_expressions.hpp>
+
+#include <kfr/base/reduce.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+TEST(sample_conversion)
+{
+ CHECK(convert_sample<float>(static_cast<i8>(-127)) == -1.f);
+ CHECK(convert_sample<float>(static_cast<i8>(0)) == 0.f);
+ CHECK(convert_sample<float>(static_cast<i8>(127)) == 1.f);
+
+ CHECK(convert_sample<float>(static_cast<i16>(-32767)) == -1.f);
+ CHECK(convert_sample<float>(static_cast<i16>(0)) == 0.f);
+ CHECK(convert_sample<float>(static_cast<i16>(32767)) == 1.f);
+
+ CHECK(convert_sample<float>(static_cast<i24>(-8388607)) == -1.f);
+ CHECK(convert_sample<float>(static_cast<i24>(0)) == 0.f);
+ CHECK(convert_sample<float>(static_cast<i24>(8388607)) == 1.f);
+
+ CHECK(convert_sample<float>(static_cast<i32>(-2147483647)) == -1.f);
+ CHECK(convert_sample<float>(static_cast<i32>(0)) == 0.f);
+ CHECK(convert_sample<float>(static_cast<i32>(2147483647)) == 1.f);
+
+ CHECK(convert_sample<i8>(-1.f) == -127);
+ CHECK(convert_sample<i8>(0.f) == 0);
+ CHECK(convert_sample<i8>(1.f) == 127);
+
+ CHECK(convert_sample<i16>(-1.f) == -32767);
+ CHECK(convert_sample<i16>(0.f) == 0);
+ CHECK(convert_sample<i16>(1.f) == 32767);
+
+ CHECK(convert_sample<i24>(-1.f) == -8388607);
+ CHECK(convert_sample<i24>(0.f) == 0);
+ CHECK(convert_sample<i24>(1.f) == 8388607);
+
+ CHECK(convert_sample<i32>(-1.f) == -2147483647);
+ CHECK(convert_sample<i32>(0.f) == 0);
+ CHECK(convert_sample<i32>(1.f) == 2147483647);
+}
+
+TEST(sample_interleave_deinterleave)
+{
+ const size_t size = 50;
+ univector2d<float> in;
+ in.push_back(truncate(counter() * 3.f + 0.f, size));
+ in.push_back(truncate(counter() * 3.f + 1.f, size));
+ in.push_back(truncate(counter() * 3.f + 2.f, size));
+ univector<float> out(size * 3);
+ interleave(out.data(), std::array<const float*, 3>{ in[0].data(), in[1].data(), in[2].data() }.data(), 3,
+ size);
+ CHECK(maxof(out - render(counter() * 1.f, out.size())) == 0);
+
+ deinterleave(std::array<float*, 3>{ in[0].data(), in[1].data(), in[2].data() }.data(), out.data(), 3,
+ size);
+
+ CHECK(absmaxof(in[0] - render(counter() * 3.f + 0.f, size)) == 0);
+ CHECK(absmaxof(in[1] - render(counter() * 3.f + 1.f, size)) == 0);
+ CHECK(absmaxof(in[2] - render(counter() * 3.f + 2.f, size)) == 0);
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/base/reduce.cpp b/tests/unit/base/reduce.cpp
@@ -0,0 +1,41 @@
+#include <kfr/base/reduce.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+
+TEST(reduce)
+{
+ {
+ univector<float, 5> a({ 1, 2, 3, 4, 5 });
+ CHECK(sum(a) == 15);
+ CHECK(mean(a) == 3);
+ CHECK(minof(a) == 1);
+ CHECK(maxof(a) == 5);
+ CHECK(sumsqr(a) == 55);
+ CHECK(rms(a) == 3.316624790355399849115f);
+ CHECK(product(a) == 120);
+ }
+ {
+ univector<double, 5> a({ 1, 2, 3, 4, 5 });
+ CHECK(sum(a) == 15);
+ CHECK(mean(a) == 3);
+ CHECK(minof(a) == 1);
+ CHECK(maxof(a) == 5);
+ CHECK(sumsqr(a) == 55);
+ CHECK(rms(a) == 3.316624790355399849115);
+ CHECK(product(a) == 120);
+ }
+ {
+ univector<int, 5> a({ 1, 2, 3, 4, 5 });
+ CHECK(sum(a) == 15);
+ CHECK(mean(a) == 3);
+ CHECK(minof(a) == 1);
+ CHECK(maxof(a) == 5);
+ CHECK(sumsqr(a) == 55);
+ CHECK(product(a) == 120);
+ }
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/abs.cpp b/tests/unit/math/abs.cpp
@@ -0,0 +1,13 @@
+#include <kfr/math/abs.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(abs)
+{
+ test_function1(test_catogories::all, [](auto x) { return kfr::abs(x); },
+ [](auto x) -> decltype(x) { return x >= 0 ? x : -x; });
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/asin_acos.cpp b/tests/unit/math/asin_acos.cpp
@@ -0,0 +1,18 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+#include "../../numeric_tests.hpp"
+
+#include <kfr/math/asin_acos.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+KFR_AUTO_TEST_1(asin, narrow, 6, 1)
+KFR_AUTO_TEST_1(acos, narrow, 800, 1)
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
diff --git a/tests/unit/math/atan.cpp b/tests/unit/math/atan.cpp
@@ -0,0 +1,18 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+
+#include "../../numeric_tests.hpp"
+
+#include <kfr/math/atan.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+KFR_AUTO_TEST_1(atan, narrow, 2, 1)
+KFR_AUTO_TEST_2(atan2, narrow, 2, 1)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/hyperbolic.cpp b/tests/unit/math/hyperbolic.cpp
@@ -0,0 +1,21 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+
+#include "../../numeric_tests.hpp"
+
+#include <kfr/math/hyperbolic.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+KFR_AUTO_TEST_1(sinh, narrow, 114, 2.5)
+KFR_AUTO_TEST_1(cosh, narrow, 7, 2.5)
+KFR_AUTO_TEST_1(tanh, narrow, 45, 1)
+KFR_AUTO_TEST_1(coth, narrow, 85, 1)
+} // namespace CMT_ARCH_NAME
+
+} // namespace kfr
diff --git a/tests/unit/math/log_exp.cpp b/tests/unit/math/log_exp.cpp
@@ -0,0 +1,23 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+#include "../../numeric_tests.hpp"
+
+#include <kfr/math/log_exp.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+KFR_AUTO_TEST_1(gamma, narrow, 2200, 321)
+KFR_AUTO_TEST_1(exp, narrow, 4, 2)
+KFR_AUTO_TEST_1(exp2, narrow, 5, 2)
+KFR_AUTO_TEST_1(exp10, narrow, 40, 10)
+KFR_AUTO_TEST_1(log, narrow, 2, 1)
+KFR_AUTO_TEST_1(log2, narrow, 2, 1)
+KFR_AUTO_TEST_1(log10, narrow, 3, 1)
+KFR_AUTO_TEST_1(cbrt, narrow, 5, 1)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/min_max.cpp b/tests/unit/math/min_max.cpp
@@ -0,0 +1,39 @@
+#include <kfr/math/min_max.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(min)
+{
+ test_function2(test_catogories::all, [](auto x, auto y) { return kfr::min(x, y); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x <= y ? x : y; });
+}
+
+TEST(max)
+{
+ test_function2(test_catogories::all, [](auto x, auto y) { return kfr::max(x, y); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x >= y ? x : y; });
+}
+
+TEST(absmin)
+{
+ test_function2(test_catogories::all, [](auto x, auto y) { return kfr::absmin(x, y); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ x = x >= 0 ? x : -x;
+ y = y >= 0 ? y : -y;
+ return x <= y ? x : y;
+ });
+}
+
+TEST(absmax)
+{
+ test_function2(test_catogories::all, [](auto x, auto y) { return kfr::absmax(x, y); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ x = x >= 0 ? x : -x;
+ y = y >= 0 ? y : -y;
+ return x >= y ? x : y;
+ });
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/round.cpp b/tests/unit/math/round.cpp
@@ -0,0 +1,53 @@
+#include <kfr/math/round.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(floor)
+{
+ test_function1(test_catogories::all, [](auto x) { return kfr::floor(x); },
+ [](auto x) -> decltype(x) {
+ return std::is_integral<decltype(x)>::value ? x
+ : static_cast<decltype(x)>(std::floor(x));
+ });
+}
+
+TEST(ceil)
+{
+ test_function1(test_catogories::all, [](auto x) { return kfr::ceil(x); },
+ [](auto x) -> decltype(x) {
+ return std::is_integral<decltype(x)>::value ? x
+ : static_cast<decltype(x)>(std::ceil(x));
+ });
+}
+
+TEST(trunc)
+{
+ test_function1(test_catogories::all, [](auto x) { return kfr::trunc(x); },
+ [](auto x) -> decltype(x) {
+ return std::is_integral<decltype(x)>::value ? x
+ : static_cast<decltype(x)>(std::trunc(x));
+ });
+}
+
+TEST(round)
+{
+ test_function1(test_catogories::all, [](auto x) { return kfr::round(x); },
+ [](auto x) -> decltype(x) {
+ return std::is_integral<decltype(x)>::value ? x
+ : static_cast<decltype(x)>(std::round(x));
+ });
+}
+
+TEST(fract)
+{
+ test_function1(test_catogories::all, [](auto x) { return kfr::fract(x); },
+ [](auto x) -> decltype(x) {
+ return std::is_integral<decltype(x)>::value
+ ? 0
+ : static_cast<decltype(x)>(x - std::floor(x));
+ });
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/select.cpp b/tests/unit/math/select.cpp
@@ -0,0 +1,27 @@
+#include <kfr/math/select.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(select_true)
+{
+ test_function2(test_catogories::vectors,
+ [](auto x, auto y) {
+ mask<subtype<decltype(x)>, decltype(x)::scalar_size()> m(true);
+ return kfr::select(m, x, y);
+ },
+ [](auto x, auto) { return x; });
+}
+
+TEST(select_false)
+{
+ test_function2(test_catogories::vectors,
+ [](auto x, auto y) {
+ mask<subtype<decltype(x)>, decltype(x)::scalar_size()> m(false);
+ return kfr::select(m, x, y);
+ },
+ [](auto, auto y) { return y; });
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/sin_cos.cpp b/tests/unit/math/sin_cos.cpp
@@ -0,0 +1,17 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+#include "../../numeric_tests.hpp"
+
+#include <kfr/math/sin_cos.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+KFR_AUTO_TEST_1(sin, narrow, 7, 1)
+KFR_AUTO_TEST_1(cos, narrow, 2, 1)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/math/tan.cpp b/tests/unit/math/tan.cpp
@@ -0,0 +1,16 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+#include "../../numeric_tests.hpp"
+
+#include <kfr/math/tan.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+KFR_AUTO_TEST_1(tan, narrow, 7, 1)
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/simd/complex.cpp b/tests/unit/simd/complex.cpp
@@ -0,0 +1,33 @@
+#include <kfr/simd/complex.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(complex_convertible)
+{
+ static_assert(std::is_convertible<float, complex<float>>::value, "");
+ static_assert(std::is_convertible<float, complex<double>>::value, "");
+ static_assert(std::is_convertible<short, complex<double>>::value, "");
+
+ static_assert(std::is_convertible<complex<float>, vec<complex<float>, 4>>::value, "");
+ static_assert(!std::is_convertible<vec<complex<float>, 1>, vec<complex<float>, 4>>::value, "");
+
+ static_assert(std::is_convertible<vec<complex<float>, 2>, vec<complex<double>, 2>>::value, "");
+ static_assert(std::is_convertible<vec<vec<float, 4>, 2>, vec<vec<double, 4>, 2>>::value, "");
+
+ CHECK(static_cast<complex<float>>(10.f) == complex<float>{ 10.f, 0.f });
+ CHECK(static_cast<complex<double>>(10.f) == complex<double>{ 10., 0. });
+ CHECK(static_cast<complex<double>>(static_cast<short>(10)) == complex<double>{ 10., 0. });
+
+ CHECK(static_cast<vec<complex<float>, 2>>(complex<float>{ 1.f, 2.f }) ==
+ vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } });
+
+ CHECK(static_cast<vec<complex<float>, 4>>(complex<float>{ 1.f, 2.f }) ==
+ vec<complex<float>, 4>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f }, c32{ 1.f, 2.f } });
+
+ CHECK(static_cast<vec<complex<double>, 2>>(vec<complex<float>, 2>{ c32{ 1.f, 2.f }, c32{ 1.f, 2.f } }) ==
+ vec<complex<double>, 2>{ c64{ 1., 2. }, c64{ 1., 2. } });
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/simd/operators.cpp b/tests/unit/simd/operators.cpp
@@ -0,0 +1,220 @@
+#include <kfr/simd/horizontal.hpp>
+#include <kfr/simd/operators.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(neg)
+{
+ test_function1(test_catogories::vectors, [](auto x) -> decltype(x) { return -x; },
+ [](auto x) -> decltype(x) { return -x; });
+}
+
+TEST(bnot)
+{
+ test_function1(test_catogories::vectors, [](auto x) -> decltype(x) { return ~x; },
+ [](auto x) -> decltype(x) {
+ utype<decltype(x)> u = ~ubitcast(x);
+ return bitcast<decltype(x)>(u);
+ });
+}
+
+TEST(add)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return x + y; },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x + y; });
+}
+
+TEST(sub)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return x - y; },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x - y; });
+}
+
+TEST(mul)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return x * y; },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> { return x * y; });
+}
+
+template <typename T>
+inline bool is_safe_division(T x, T y)
+{
+ return y != T(0) && !(std::is_signed<T>::value && x == std::numeric_limits<T>::min() && y == T(-1));
+}
+
+TEST(div)
+{
+ test_function2(test_catogories::vectors,
+ [](auto x, auto y) {
+ return is_safe_division<subtype<decltype(x)>>(x.front(), y.front()) ? x / y : 0;
+ },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return is_safe_division(x, y) ? x / y : 0;
+ });
+}
+
+TEST(bor)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return x | y; },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ using T = common_type<decltype(x), decltype(y)>;
+ return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) | ubitcast(T(y))));
+ });
+}
+
+TEST(bxor)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return x ^ y; },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ using T = common_type<decltype(x), decltype(y)>;
+ return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) ^ ubitcast(T(y))));
+ });
+}
+
+TEST(band)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return x & y; },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ using T = common_type<decltype(x), decltype(y)>;
+ return bitcast<T>(static_cast<utype<T>>(ubitcast(T(x)) & ubitcast(T(y))));
+ });
+}
+
+TEST(shl)
+{
+ testo::matrix(
+ named("type") = test_catogories::types(test_catogories::vectors), named("value1") = special_values(),
+ named("shift") = std::vector<unsigned>{ 1, 2, 7, 8, 9, 15, 16, 31, 32, 63, 64 },
+ [&](auto type, special_value value, unsigned shift) {
+ using T = type_of<decltype(type)>;
+ if (shift < sizeof(subtype<T>))
+ {
+ const T x(value);
+ CHECK(std::is_same<decltype(x << shift), T>::value);
+ CHECK((x << shift) == apply(
+ [=](auto x) -> decltype(x) {
+ return bitcast<decltype(x)>(
+ static_cast<uitype<decltype(x)>>(uibitcast(x) << shift));
+ },
+ x));
+ CHECK((x << broadcast<T::scalar_size()>(utype<subtype<T>>(shift))) ==
+ apply(
+ [=](auto x) -> decltype(x) {
+ return bitcast<decltype(x)>(
+ static_cast<uitype<decltype(x)>>(uibitcast(x) << shift));
+ },
+ x));
+ }
+ });
+}
+
+TEST(shr)
+{
+ testo::matrix(
+ named("type") = test_catogories::types(test_catogories::vectors), named("value1") = special_values(),
+ named("shift") = std::vector<unsigned>{ 1, 2, 7, 8, 9, 15, 16, 31, 32, 63, 64 },
+ [&](auto type, special_value value, unsigned shift) {
+ using T = type_of<decltype(type)>;
+ if (shift < sizeof(subtype<T>))
+ {
+ const T x(value);
+ CHECK(std::is_same<decltype(x << shift), T>::value);
+ CHECK((x >> shift) == apply(
+ [=](auto x) -> decltype(x) {
+ return bitcast<decltype(x)>(
+ static_cast<uitype<decltype(x)>>(uibitcast(x) >> shift));
+ },
+ x));
+ CHECK((x >> broadcast<T::scalar_size()>(utype<subtype<T>>(shift))) ==
+ apply(
+ [=](auto x) -> decltype(x) {
+ return bitcast<decltype(x)>(
+ static_cast<uitype<decltype(x)>>(uibitcast(x) >> shift));
+ },
+ x));
+ }
+ });
+}
+
+TEST(eq)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return (x == y).asvec(); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return internal::maskbits<subtype<decltype(x)>>(x == y);
+ });
+}
+
+TEST(ne)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return (x != y).asvec(); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return internal::maskbits<subtype<decltype(x)>>(x != y);
+ });
+}
+
+TEST(ge)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return (x >= y).asvec(); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return internal::maskbits<subtype<decltype(x)>>(x >= y);
+ });
+}
+
+TEST(le)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return (x <= y).asvec(); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return internal::maskbits<subtype<decltype(x)>>(x <= y);
+ });
+}
+
+TEST(gt)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return (x > y).asvec(); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return internal::maskbits<subtype<decltype(x)>>(x > y);
+ });
+}
+
+TEST(lt)
+{
+ test_function2(test_catogories::vectors, [](auto x, auto y) { return (x < y).asvec(); },
+ [](auto x, auto y) -> common_type<decltype(x), decltype(y)> {
+ return internal::maskbits<subtype<decltype(x)>>(x < y);
+ });
+}
+
+TEST(horner)
+{
+ CHECK(horner(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 17, 34));
+ CHECK(horner_odd(pack(0, 1, 2, 3), 1, 2, 3) == pack(0, 6, 114, 786));
+ CHECK(horner_even(pack(0, 1, 2, 3), 1, 2, 3) == pack(1, 6, 57, 262));
+}
+
+TEST(matrix)
+{
+ using i32x2x2 = vec<vec<int, 2>, 2>;
+ const i32x2x2 m22{ i32x2{ 1, 2 }, i32x2{ 3, 4 } };
+ CHECK(m22 * 10 == i32x2x2{ i32x2{ 10, 20 }, i32x2{ 30, 40 } });
+
+ CHECK(m22 * i32x2{ -1, 100 } == i32x2x2{ i32x2{ -1, 200 }, i32x2{ -3, 400 } });
+
+ i32x2 xy{ 10, 20 };
+ i32x2x2 m{ i32x2{ 1, 2 }, i32x2{ 3, 4 } };
+ xy = hadd(xy * m);
+ CHECK(xy == i32x2{ 40, 120 });
+
+ i32x2 xy2{ 10, 20 };
+ xy2 = hadd(transpose(xy2 * m));
+ CHECK(xy2 == i32x2{ 50, 110 });
+}
+
+TEST(apply)
+{
+ CHECK(apply([](int x) { return x + 1; }, make_vector(1, 2, 3, 4, 5)) == make_vector(2, 3, 4, 5, 6));
+ CHECK(apply(fn::sqr(), make_vector(1, 2, 3, 4, 5)) == make_vector(1, 4, 9, 16, 25));
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/simd/shuffle.cpp b/tests/unit/simd/shuffle.cpp
@@ -0,0 +1,160 @@
+#include <kfr/simd/shuffle.hpp>
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(concat)
+{
+ CHECK(concat(vec<f32, 1>{ 1 }, vec<f32, 2>{ 2, 3 }, vec<f32, 1>{ 4 }, vec<f32, 3>{ 5, 6, 7 }) //
+ == vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 });
+}
+
+TEST(reverse)
+{
+ CHECK(reverse(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(7, 6, 5, 4, 3, 2, 1, 0));
+ CHECK(reverse<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(6, 7, 4, 5, 2, 3, 0, 1));
+ CHECK(reverse<4>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(4, 5, 6, 7, 0, 1, 2, 3));
+}
+
+TEST(shuffle)
+{
+ const vec<int, 8> numbers1 = enumerate<int, 8>();
+ const vec<int, 8> numbers2 = enumerate<int, 8, 100>();
+ CHECK(shuffle(numbers1, numbers2, elements_t<0, 8, 2, 10, 4, 12, 6, 14>()) ==
+ vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
+ CHECK(shuffle(numbers1, numbers2, elements_t<0, 8>()) == vec<int, 8>{ 0, 100, 2, 102, 4, 104, 6, 106 });
+}
+
+TEST(permute)
+{
+ const vec<int, 8> numbers1 = enumerate<int, 8>();
+ CHECK(permute(numbers1, elements_t<0, 2, 1, 3, 4, 6, 5, 7>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
+ CHECK(permute(numbers1, elements_t<0, 2, 1, 3>()) == vec<int, 8>{ 0, 2, 1, 3, 4, 6, 5, 7 });
+}
+
+TEST(blend)
+{
+ const vec<int, 8> numbers1 = enumerate<int, 8>();
+ const vec<int, 8> numbers2 = enumerate<int, 8, 100>();
+ CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1, 0, 1, 1, 0, 1>()) ==
+ vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
+ CHECK(blend(numbers1, numbers2, elements_t<0, 1, 1>()) ==
+ vec<int, 8>{ 0, 101, 102, 3, 104, 105, 6, 107 });
+}
+
+TEST(duplicate_shuffle)
+{
+ CHECK(dup(pack(0, 1, 2, 3)) == pack(0, 0, 1, 1, 2, 2, 3, 3));
+ CHECK(duphalfs(pack(0, 1, 2, 3)) == pack(0, 1, 2, 3, 0, 1, 2, 3));
+ CHECK(dupeven(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 0, 2, 2, 4, 4, 6, 6));
+ CHECK(dupodd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 1, 3, 3, 5, 5, 7, 7));
+}
+
+TEST(split_interleave)
+{
+ vec<f32, 1> a1;
+ vec<f32, 2> a23;
+ vec<f32, 1> a4;
+ vec<f32, 3> a567;
+ split(vec<f32, 7>{ 1, 2, 3, 4, 5, 6, 7 }, a1, a23, a4, a567);
+ CHECK(a1 == vec<f32, 1>{ 1 });
+ CHECK(a23 == vec<f32, 2>{ 2, 3 });
+ CHECK(a4 == vec<f32, 1>{ 4 });
+ CHECK(a567 == vec<f32, 3>{ 5, 6, 7 });
+
+ CHECK(splitpairs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6, 1, 3, 5, 7));
+ CHECK(splitpairs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7));
+
+ CHECK(interleavehalfs(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 4, 1, 5, 2, 6, 3, 7));
+ CHECK(interleavehalfs<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5, 2, 3, 6, 7));
+}
+
+TEST(broadcast)
+{
+ CHECK(broadcast<8>(1) == pack(1, 1, 1, 1, 1, 1, 1, 1));
+ CHECK(broadcast<8>(1, 2) == pack(1, 2, 1, 2, 1, 2, 1, 2));
+ CHECK(broadcast<8>(1, 2, 3, 4) == pack(1, 2, 3, 4, 1, 2, 3, 4));
+ CHECK(broadcast<8>(1, 2, 3, 4, 5, 6, 7, 8) == pack(1, 2, 3, 4, 5, 6, 7, 8));
+
+ CHECK(broadcast<5>(3.f) == vec<f32, 5>{ 3, 3, 3, 3, 3 });
+ CHECK(broadcast<6>(1.f, 2.f) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 });
+ CHECK(broadcast<6>(1.f, 2.f, 3.f) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 });
+}
+
+TEST(resize)
+{
+ CHECK(resize<5>(make_vector(3.f)) == vec<f32, 5>{ 3, 3, 3, 3, 3 });
+ CHECK(resize<6>(make_vector(1.f, 2.f)) == vec<f32, 6>{ 1, 2, 1, 2, 1, 2 });
+ CHECK(resize<6>(make_vector(1.f, 2.f, 3.f)) == vec<f32, 6>{ 1, 2, 3, 1, 2, 3 });
+}
+
+TEST(make_vector)
+{
+ const signed char ch = -1;
+ CHECK(make_vector(1, 2, ch) == vec<i32, 3>{ 1, 2, -1 });
+ const i64 v = -100;
+ CHECK(make_vector(1, 2, v) == vec<i64, 3>{ 1, 2, -100 });
+ CHECK(make_vector<i64>(1, 2, ch) == vec<i64, 3>{ 1, 2, -1 });
+ CHECK(make_vector<f32>(1, 2, ch) == vec<f32, 3>{ 1, 2, -1 });
+
+ CHECK(make_vector(f64x2{ 1, 2 }, f64x2{ 10, 20 }) ==
+ vec<vec<f64, 2>, 2>{ f64x2{ 1, 2 }, f64x2{ 10, 20 } });
+ CHECK(make_vector(1.f, f32x2{ 10, 20 }) == vec<vec<f32, 2>, 2>{ f32x2{ 1, 1 }, f32x2{ 10, 20 } });
+}
+
+TEST(zerovector)
+{
+ CHECK(zerovector<f32, 3>() == f32x3{ 0, 0, 0 });
+ // CHECK(zerovector<i16, 3>() == i16x3{ 0, 0, 0 }); // clang 3.9 (trunk) crashes here
+ CHECK(zerovector(f64x8{}) == f64x8{ 0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(allonesvector)
+{
+ CHECK(bitcast<u32>(special_constants<f32>::allones()) == 0xFFFFFFFFu);
+ CHECK(bitcast<u64>(special_constants<f64>::allones()) == 0xFFFFFFFFFFFFFFFFull);
+
+ CHECK(allonesvector<i16, 3>() == i16x3{ -1, -1, -1 });
+ CHECK(allonesvector<u8, 3>() == u8x3{ 255, 255, 255 });
+}
+
+TEST(transpose)
+{
+ const auto sixteen = enumerate<float, 16>();
+ CHECK(transpose<4>(sixteen) == vec<float, 16>(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));
+}
+
+TEST(odd_even)
+{
+ CHECK(even(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 2, 4, 6));
+ CHECK(odd(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(1, 3, 5, 7));
+
+ CHECK(even<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(0, 1, 4, 5));
+ CHECK(odd<2>(pack(0, 1, 2, 3, 4, 5, 6, 7)) == pack(2, 3, 6, 7));
+}
+
+TEST(low_high)
+{
+ CHECK(low(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(1, 2, 3, 4));
+ CHECK(high(vec<u8, 8>(1, 2, 3, 4, 5, 6, 7, 8)) == vec<u8, 4>(5, 6, 7, 8));
+
+ CHECK(low(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 4>(1, 2, 3, 4));
+ CHECK(high(vec<u8, 7>(1, 2, 3, 4, 5, 6, 7)) == vec<u8, 3>(5, 6, 7));
+
+ CHECK(low(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 4>(1, 2, 3, 4));
+ CHECK(high(vec<u8, 6>(1, 2, 3, 4, 5, 6)) == vec<u8, 2>(5, 6));
+
+ CHECK(low(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 4>(1, 2, 3, 4));
+ CHECK(high(vec<u8, 5>(1, 2, 3, 4, 5)) == vec<u8, 1>(5));
+
+ CHECK(low(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(1, 2));
+ CHECK(high(vec<u8, 4>(1, 2, 3, 4)) == vec<u8, 2>(3, 4));
+
+ CHECK(low(vec<u8, 3>(1, 2, 3)) == vec<u8, 2>(1, 2));
+ CHECK(high(vec<u8, 3>(1, 2, 3)) == vec<u8, 1>(3));
+
+ CHECK(low(vec<u8, 2>(1, 2)) == vec<u8, 1>(1));
+ CHECK(high(vec<u8, 2>(1, 2)) == vec<u8, 1>(2));
+}
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp
@@ -0,0 +1,114 @@
+#include <kfr/simd/vec.hpp>
+
+#include <kfr/io/tostring.hpp>
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+TEST(cones)
+{
+ CHECK(vec<int, 2>(cones) == vec<int, 2>(-1, -1));
+ CHECK(vec<float, 2>(cones) == vec<f32, 2>(bitcast<f32>(-1), bitcast<f32>(-1)));
+}
+TEST(vec_broadcast)
+{
+ CHECK(static_cast<f32x4>(4.f) == f32x4{ 4.f, 4.f, 4.f, 4.f });
+ CHECK(static_cast<f64x8>(4.f) == f64x8{ 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0 });
+ CHECK(static_cast<u8x3>(4.f) == u8x3{ 4, 4, 4 });
+}
+template <typename Tout, typename Tin>
+bool is_in_range_of(Tin x)
+{
+ return (is_f_class<Tin>::value && is_f_class<Tout>::value) || static_cast<Tin>(static_cast<Tout>(x)) == x;
+}
+
+TEST(cast)
+{
+ testo::assert_is_same<i32x4, kfr::common_type<i32x4>>();
+ testo::assert_is_same<u32x4, kfr::common_type<i32x4, u32x4>>();
+ testo::assert_is_same<f64x4, kfr::common_type<i32x4, u32x4, f64x4>>();
+
+ CHECK(static_cast<i32x4>(u16x4{ 1, 2, 3, 4 }) == i32x4{ 1, 2, 3, 4 });
+
+ CHECK(static_cast<vec<vec<double, 4>, 2>>(vec<vec<float, 4>, 2>{
+ vec<float, 4>{ 1.f, 2.f, 3.f, 4.f }, vec<float, 4>{ 11.f, 22.f, 33.f, 44.f } }) ==
+ vec<vec<double, 4>, 2>{ vec<double, 4>{ 1., 2., 3., 4. }, vec<double, 4>{ 11., 22., 33., 44. } });
+
+ static_assert(std::is_convertible<float, f32x4>::value, "");
+ static_assert(std::is_convertible<float, f64x8>::value, "");
+ static_assert(std::is_convertible<float, u8x3>::value, "");
+
+ static_assert(std::is_convertible<u16x4, i32x4>::value, "");
+ static_assert(!std::is_convertible<u16x4, i32x3>::value, "");
+ static_assert(!std::is_convertible<u16x1, u16x16>::value, "");
+
+ static_assert(is_same<decltype(innercast<f64>(f32x4x4(1))), f64x4x4>::value, "");
+ static_assert(is_same<decltype(innercast<f64>(f32x4(1))), f64x4>::value, "");
+ static_assert(is_same<decltype(innercast<f64>(f32(1))), f64>::value, "");
+
+ // N/A static_assert(is_same<decltype(innercast<f64x4>(f32x4x4(1))), f64x4x4>::value, "");
+ static_assert(is_same<decltype(innercast<f64x4>(f32x4(1))), f64x4x4>::value, "");
+ static_assert(is_same<decltype(innercast<f64x4>(f32(1))), f64x4>::value, "");
+
+ // N/A static_assert(is_same<decltype(elemcast<f64>(f32x4x4(1))), f64x4>::value, "");
+ static_assert(is_same<decltype(elemcast<f64>(f32x4(1))), f64x4>::value, "");
+
+ static_assert(is_same<decltype(elemcast<f64x4>(f32x4x4(1))), f64x4x4>::value, "");
+ static_assert(is_same<decltype(elemcast<f64x4>(f32x4(1))), f64x4x4>::value, "");
+
+ testo::scope s("");
+ s.text = ("target_type = u8");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<u8>(x); },
+ [](auto x) -> u8 { return static_cast<u8>(x); },
+ [](auto t, special_value x) { return is_in_range_of<u8>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = i8");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<i8>(x); },
+ [](auto x) -> i8 { return static_cast<i8>(x); },
+ [](auto t, special_value x) { return is_in_range_of<i8>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = u16");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<u16>(x); },
+ [](auto x) -> u16 { return static_cast<u16>(x); },
+ [](auto t, special_value x) { return is_in_range_of<u16>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = i16");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<i16>(x); },
+ [](auto x) -> i16 { return static_cast<i16>(x); },
+ [](auto t, special_value x) { return is_in_range_of<i16>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = u32");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<u32>(x); },
+ [](auto x) -> u32 { return static_cast<u32>(x); },
+ [](auto t, special_value x) { return is_in_range_of<u32>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = i32");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<i32>(x); },
+ [](auto x) -> i32 { return static_cast<i32>(x); },
+ [](auto t, special_value x) { return is_in_range_of<i32>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = u64");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<u64>(x); },
+ [](auto x) -> u64 { return static_cast<u64>(x); },
+ [](auto t, special_value x) { return is_in_range_of<u64>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = i64");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<i64>(x); },
+ [](auto x) -> i64 { return static_cast<i64>(x); },
+ [](auto t, special_value x) { return is_in_range_of<i64>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = f32");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<f32>(x); },
+ [](auto x) -> f32 { return static_cast<f32>(x); },
+ [](auto t, special_value x) { return is_in_range_of<f32>(x.get<subtype<type_of<decltype(t)>>>()); });
+ s.text = ("target_type = f64");
+ test_function1(
+ test_catogories::all, [](auto x) { return kfr::innercast<f64>(x); },
+ [](auto x) -> f64 { return static_cast<f64>(x); },
+ [](auto t, special_value x) { return is_in_range_of<f64>(x.get<subtype<type_of<decltype(t)>>>()); });
+}
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
@@ -0,0 +1,28 @@
+# Copyright (C) 2016 D Levin (http://www.kfrlib.com)
+# This file is part of KFR
+#
+# KFR is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# KFR is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with KFR.
+
+
+cmake_minimum_required(VERSION 3.1)
+
+# Binary output directories
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin)
+
+add_executable(sample_rate_converter sample_rate_converter.cpp)
+target_link_libraries(sample_rate_converter kfr kfr_io use_arch)
+
+add_executable(ebu_test ebu_test.cpp)
+target_link_libraries(ebu_test kfr kfr_io use_arch)
diff --git a/tools/ebu_test.cpp b/tools/ebu_test.cpp
@@ -0,0 +1,120 @@
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016 D Levin
+ * See LICENSE.txt for details
+ */
+
+#include <kfr/base.hpp>
+#include <kfr/dsp.hpp>
+#include <kfr/io.hpp>
+
+using namespace kfr;
+
+int main(int argc, char** argv)
+{
+ if (argc < 3)
+ {
+ println("Usage: ebu_test INPUT_IN_F32_RAW_FORMAT CHANNEL_NUMBER");
+ return 1;
+ }
+
+ // Prepare
+ FILE* f = fopen(argv[1], "rb");
+ const int channel_number = atoi(argv[2]);
+ if (channel_number < 1 || channel_number > 6)
+ {
+ println("Incorrect number of channels");
+ return 1;
+ }
+ fseek(f, 0, SEEK_END);
+ uintmax_t size = ftell(f);
+ fseek(f, 0, SEEK_SET);
+ if (size % (sizeof(float) * channel_number))
+ {
+ println("Incorrect file size");
+ return 1;
+ }
+
+ // Read file
+ const size_t length = size / (sizeof(float) * channel_number);
+ univector<float> interleaved(size / sizeof(float));
+ size_t read_len = fread(interleaved.data(), 1, size, f);
+ if (read_len != size)
+ {
+ println("Can't read file");
+ return 1;
+ }
+
+ // Deinterleave
+ univector<univector<float>> data(channel_number, univector<float>(length));
+ for (size_t ch = 0; ch < channel_number; ++ch)
+ {
+ for (size_t i = 0; i < length; ++i)
+ {
+ data[ch][i] = interleaved[i * channel_number + ch];
+ }
+ }
+
+ std::vector<Speaker> speakers;
+ switch (channel_number)
+ {
+ case 1:
+ speakers = { Speaker::Mono };
+ break;
+ case 2:
+ speakers = { Speaker::Left, Speaker::Right };
+ break;
+ case 3:
+ speakers = { Speaker::Left, Speaker::Right, Speaker::Center };
+ break;
+ case 4:
+ speakers = { Speaker::Left, Speaker::Right, Speaker::LeftSurround, Speaker::RightSurround };
+ break;
+ case 5:
+ speakers = { Speaker::Left, Speaker::Right, Speaker::Center, Speaker::LeftSurround,
+ Speaker::RightSurround };
+ break;
+ case 6:
+ speakers = { Speaker::Left, Speaker::Right, Speaker::Center,
+ Speaker::LeftSurround, Speaker::RightSurround, Speaker::Lfe };
+ break;
+ }
+
+ ebu_r128<float> loudness(48000, speakers);
+
+ float M, S, I, RL, RH;
+ float maxM = -HUGE_VALF, maxS = -HUGE_VALF;
+ for (size_t i = 0; i < length / loudness.packet_size(); i++)
+ {
+ std::vector<univector_ref<float>> channels;
+ for (size_t ch = 0; ch < channel_number; ++ch)
+ {
+ channels.push_back(data[ch].slice(i * loudness.packet_size(), loudness.packet_size()));
+ }
+ loudness.process_packet(channels);
+ loudness.get_values(M, S, I, RL, RH);
+ maxM = std::max(maxM, M);
+ maxS = std::max(maxS, S);
+ }
+
+ {
+ // For file-based measurements, the signal should be followed by at least 1.5 s of silence
+ std::vector<univector_dyn<float>> channels(channel_number,
+ univector_dyn<float>(loudness.packet_size()));
+ for (size_t i = 0; i < 15; ++i)
+ loudness.process_packet(channels);
+ float dummyM, dummyS, dummyI;
+ loudness.get_values(dummyM, dummyS, dummyI, RL, RH);
+ }
+
+ println(argv[1]);
+ println("M = ", M);
+ println("S = ", S);
+ println("I = ", I);
+ println("LRA = ", RH - RL);
+ println("maxM = ", maxM);
+ println("maxS = ", maxS);
+ println();
+
+ return 0;
+}
diff --git a/examples/sample_rate_converter.cpp b/tools/sample_rate_converter.cpp
diff --git a/update-sources.py b/update-sources.py
@@ -7,25 +7,33 @@ import subprocess
import sys
import glob
-path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'include')
+def list_sources(name, searchpath, masks):
+ global cmake
+ path = os.path.join(os.path.dirname(os.path.realpath(__file__)), searchpath)
+ filenames = []
+ for root, dirnames, files in os.walk(path, path):
+ for mask in masks:
+ for filename in fnmatch.filter(files, mask):
+ filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/'))
-masks = ['*.hpp', '*.h', '*.i', '*.inc']
+ cmake += """
+set(
+ """ + name + """
+ """ + "\n ".join(['${PROJECT_SOURCE_DIR}/' + searchpath + '/' + f for f in filenames]) + """
+)
-filenames = []
-for root, dirnames, files in os.walk(path, path):
- for mask in masks:
- for filename in fnmatch.filter(files, mask):
- filenames.append(os.path.relpath(os.path.join(root, filename), path).replace('\\','/'))
+ """
cmake = """
# Auto-generated file. Do not edit
# Use update-sources.py
-
-set(
- KFR_SRC
- """ + "\n ".join(['${PROJECT_SOURCE_DIR}/include/' + f for f in filenames]) + """
-)
"""
+list_sources("KFR_SRC", "include", ['*.hpp', '*.h', '*.i', '*.inc'])
+list_sources("KFR_DFT_SRC", "include/kfr/dft", ['*.cpp'])
+list_sources("KFR_IO_SRC", "include/kfr/io", ['*.cpp'])
+
+list_sources("KFR_UNITTEST_SRC", "tests/unit", ['*.cpp'])
+
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sources.cmake'), "w") as f:
f.write(cmake)