Merge pull request #76 from slarew/updates - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit fc7924baf11234af2f46767174728cea7c5c0796
parent 6c5e112c315e3e5021c4285810a82ef459958ac6
Author: Dan Levin <d.levin256@gmail.com>
Date:   Mon, 24 Feb 2020 16:19:56 +0300

Merge pull request #76 from slarew/updates

Fix std::complex compatibility and several features
Diffstat:
M docs/docs/convolution.md  | 45 +++++++++++++++++++++++++++++++++++++++++++++
M examples/CMakeLists.txt  | 2 ++
A examples/ccv.cpp  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M include/kfr/base/generators.hpp  | 41 +++++++++++++++++++++++++++++++++++++++++
M include/kfr/dft/convolution.hpp  | 38 +++++++++++++++++++++++++++++---------
M include/kfr/dft/fft.hpp  | 6 +++---
M include/kfr/dft/impl/convolution-impl.cpp  | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M include/kfr/dsp/delay.hpp  | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M include/kfr/dsp/fir.hpp  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M include/kfr/dsp/iir_design.hpp  | 22 +++++++++++-----------
A include/kfr/dsp/state_holder.hpp  | 41 +++++++++++++++++++++++++++++++++++++++++
M include/kfr/math/complex_math.hpp  | 33 +++++++++++++++++++++++++++++++++
M include/kfr/simd/complex.hpp  | 9 +++++----
M tests/base_test.cpp  | 12 ++++++++++++
M tests/dft_test.cpp  | 36 +++++++++++++++++++++++++++++++++++-
M tests/dsp_test.cpp  | 46 +++++++++++++++++++++++++++++++++++++++++++---

16 files changed, 701 insertions(+), 116 deletions(-)
diff --git a/docs/docs/convolution.md b/docs/docs/convolution.md
@@ -68,3 +68,48 @@ reverb_RR.apply(tmp4, audio[1]);
 audio[0] = tmp1 + tmp2;
 audio[1] = tmp3 + tmp4;
 ```
+
+# Implementation Details
+
+The convolution filter efficiently computes the convolution of two signals.
+The efficiency is achieved by employing the FFT and the circular convolution
+theorem.  The algorithm is a variant of the [overlap-add
+method](https://en.wikipedia.org/wiki/Overlap%E2%80%93add_method).  It works on
+a fixed block size \(B\) for arbitrarily long input signals.  Thus, the
+convolution of a streaming input signal with a long FIR filter \(h[n]\) (where
+the length of \(h[n]\) may exceed the block size \(B\)) is computed with a
+fixed complexity \(O(B \log B)\).
+
+More formally, the convolution filter computes \(y[n] = (x * h)[n]\) by
+partitioning the input \(x\) and filter \(h\) into blocks and applies the
+overlap-add method. Let \(x[n]\) be an input signal of arbitrary length. Often,
+\(x[n]\) is a streaming input with unknown length.  Let \(h[n]\) be an FIR
+filter with \(M\) taps.  The convolution filter works on a fixed block size
+\(B=2^b\).
+
+First, the input and filter are windowed and shifted to the origin to give the
+\(k\)-th block input \(x_k[n] = x[n + kB] , n=\{0,1,\ldots,B-1\},\forall
+k\in\mathbb{Z}\) and \(j\)-th block filter \(h_j[n] = h[n + jB] ,
+n=\{0,1,\ldots,B-1\},j=\{0,1,\ldots,\lfloor M/B \rfloor\}\).  The convolution
+\(y_{k,j}[n] = (x_k * h_j)[n]\) is efficiently computed with length \(2B\) FFTs
+as
+\[
+y_{k,j}[n] = \mathrm{IFFT}(\mathrm{FFT}(x_k[n])\cdot\mathrm{FFT}(h_j[n]))
+.
+\]
+
+The overlap-add method sums the "overlap" from the previous block with the current block.
+To complete the \(k\)-th block, the contribution of all blocks of the filter
+are summed together to give
+\[ y_{k}[n] = \sum_j y_{k-j,j}[n] . \]
+The final convolution is then the sum of the shifted blocks
+\[ y[n] = \sum_k y_{k}[n - kB] . \]
+Note that \(y_k[n]\) is of length \(2B\) so its second half overlaps and adds
+into the first half of the \(y_{k+1}[n]\) block.
+
+## Maximum efficiency criterion
+
+To avoid excess computation or maximize throughput, the convolution filter
+should be given input samples in multiples of the block size \(B\).  Otherwise,
+the FFT of a block is computed twice as many times as would be necessary and
+hence throughput is reduced.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -50,4 +50,6 @@ if (ENABLE_DFT)
         target_link_libraries(dft kfr_multidft)
         target_compile_definitions(dft PRIVATE -DKFR_DFT_MULTI=1)
     endif ()
+    add_executable(ccv ccv.cpp)
+    target_link_libraries(ccv kfr kfr_dft use_arch)
 endif ()
diff --git a/examples/ccv.cpp b/examples/ccv.cpp
@@ -0,0 +1,71 @@
+/*
+ * ccv, part of KFR (https://www.kfr.dev)
+ * Copyright (C) 2019  D Levin
+ * See LICENSE.txt for details
+ */
+
+// Complex convolution filter examples
+
+#define CMT_BASETYPE_F32
+
+#include <chrono>
+#include <kfr/base.hpp>
+#include <kfr/dft.hpp>
+#include <kfr/dsp.hpp>
+
+using namespace kfr;
+
+int main()
+{
+    println(library_version());
+
+    // low-pass filter
+    univector<fbase, 1023> taps127;
+    expression_pointer<fbase> kaiser = to_pointer(window_kaiser(taps127.size(), 3.0));
+    fir_lowpass(taps127, 0.2, kaiser, true);
+
+    // Create filters.
+    size_t const block_size = 256;
+    convolve_filter<complex<fbase>> conv_filter_complex(univector<complex<fbase>>(make_complex(taps127, zeros())),
+                                                block_size);
+    convolve_filter<fbase> conv_filter_real(taps127, block_size);
+
+    // Create noise to filter.
+    auto const size = 1024 * 100 + 33; // not a multiple of block_size
+    univector<complex<fbase>> cnoise =
+        make_complex(truncate(gen_random_range(random_bit_generator{ 1, 2, 3, 4 }, -1.f, +1.f), size),
+                     truncate(gen_random_range(random_bit_generator{ 3, 4, 9, 8 }, -1.f, +1.f), size));
+    univector<fbase> noise =
+                     truncate(gen_random_range(random_bit_generator{ 3, 4, 9, 8 }, -1.f, +1.f), size);
+
+    // Filter results.
+    univector<complex<fbase>> filtered_cnoise_ccv(size), filtered_cnoise_fir(size);
+    univector<fbase> filtered_noise_ccv(size), filtered_noise_fir(size);
+
+    // Complex filtering (time and compare).
+    auto tic = std::chrono::high_resolution_clock::now();
+    conv_filter_complex.apply(filtered_cnoise_ccv, cnoise);
+    auto toc           = std::chrono::high_resolution_clock::now();
+    auto const ccv_time_complex      = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
+    tic                = toc;
+    filtered_cnoise_fir = kfr::fir(cnoise, taps127);
+    toc                = std::chrono::high_resolution_clock::now();
+    auto const fir_time_complex      = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
+    auto const cdiff = rms(cabs(filtered_cnoise_fir - filtered_cnoise_ccv));
+
+    // Real filtering (time and compare).
+    tic = std::chrono::high_resolution_clock::now();
+    conv_filter_real.apply(filtered_noise_ccv, noise);
+    toc           = std::chrono::high_resolution_clock::now();
+    auto const ccv_time_real      = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
+    tic                = toc;
+    filtered_noise_fir = kfr::fir(noise, taps127);
+    toc                = std::chrono::high_resolution_clock::now();
+    auto const fir_time_real      = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
+    auto const diff = rms(filtered_noise_fir - filtered_noise_ccv);
+
+    println("complex: convolution_filter ", ccv_time_complex.count(), " fir ", fir_time_complex.count(), " diff=", cdiff);
+    println("real: convolution_filter ", ccv_time_real.count(), " fir ", fir_time_real.count(), " diff=", diff);
+
+    return 0;
+}
diff --git a/include/kfr/base/generators.hpp b/include/kfr/base/generators.hpp
@@ -136,6 +136,35 @@ protected:
     T vstep;
 };
 
+template <typename T, size_t width = vector_width<T>* bitness_const(1, 2),
+          std::enable_if_t<std::is_same<complex<deep_subtype<T>>, T>::value, int> = 0>
+struct generator_expj : generator<T, width, generator_expj<T, width>>
+{
+    using ST = deep_subtype<T>;
+
+    generator_expj(ST start_, ST step_)
+        : step(step_), alpha(2 * sqr(sin(width * step / 2))), beta(-sin(width * step))
+    {
+        this->resync(T(start_));
+    }
+    KFR_MEM_INTRINSIC void sync(T start) const CMT_NOEXCEPT { this->value = init_cossin(step, start.real()); }
+
+    KFR_MEM_INTRINSIC void next() const CMT_NOEXCEPT
+    {
+        this->value = ccomp(cdecom(this->value) -
+                            subadd(alpha * cdecom(this->value), beta * swap<2>(cdecom(this->value))));
+    }
+
+protected:
+    ST const step;
+    ST const alpha;
+    ST const beta;
+    CMT_NOINLINE static vec<T, width> init_cossin(ST w, ST phase)
+    {
+        return ccomp(cossin(dup(phase + enumerate<ST, width>() * w)));
+    }
+};
+
 template <typename T, size_t width = vector_width<T>* bitness_const(1, 2)>
 struct generator_exp2 : generator<T, width, generator_exp2<T, width>>
 {
@@ -255,6 +284,18 @@ KFR_FUNCTION internal::generator_exp<TF> gen_exp(T1 start, T2 step)
 /**
  * @brief Returns template expression that generates values using the following formula:
  * \f[
+    x_i = e^{ j ( start + i \cdot step ) }
+   \f]
+ */
+template <typename T1, typename T2, typename TF = complex<ftype<common_type<T1, T2>>>>
+KFR_FUNCTION internal::generator_expj<TF> gen_expj(T1 start, T2 step)
+{
+    return internal::generator_expj<TF>(start, step);
+}
+
+/**
+ * @brief Returns template expression that generates values using the following formula:
+ * \f[
     x_i = 2^{ start + i \cdot step }
    \f]
  */
diff --git a/include/kfr/dft/convolution.hpp b/include/kfr/dft/convolution.hpp
@@ -84,6 +84,9 @@ public:
     explicit convolve_filter(size_t size, size_t block_size = 1024);
     explicit convolve_filter(const univector_ref<const T>& data, size_t block_size = 1024);
     void set_data(const univector_ref<const T>& data);
+    void reset() final;
+    /// Apply filter to multiples of returned block size for optimal processing efficiency.
+    size_t input_block_size() const { return block_size; }
 
 protected:
     void process_expression(T* dest, const expression_pointer<T>& src, size_t size) final
@@ -93,19 +96,36 @@ protected:
     }
     void process_buffer(T* output, const T* input, size_t size) final;
 
-    const size_t size;
+    using ST                       = subtype<T>;
+    static constexpr auto real_fft = !std::is_same<T, complex<ST>>::value;
+    using plan_t                   = std::conditional_t<real_fft, dft_plan_real<T>, dft_plan<ST>>;
+
+    // Length of filter data.
+    size_t data_size;
+    // Size of block to process.
     const size_t block_size;
-    const dft_plan_real<T> fft;
+    // FFT plan for circular convolution.
+    const plan_t fft;
+    // Temp storage for FFT.
     univector<u8> temp;
-    std::vector<univector<complex<T>>> segments;
-    std::vector<univector<complex<T>>> ir_segments;
-    size_t input_position;
+    // History of input segments after fwd DFT.  History is circular relative to position below.
+    std::vector<univector<complex<ST>>> segments;
+    // Index into segments of current block.
+    size_t position;
+    // Blocks of filter/data after fwd DFT.
+    std::vector<univector<complex<ST>>> ir_segments;
+    // Saved input for current block.
     univector<T> saved_input;
-    univector<complex<T>> premul;
-    univector<complex<T>> cscratch;
-    univector<T> scratch;
+    // Index into saved_input for next input to begin.
+    size_t input_position;
+    // Pre-multiplied products of input history and delayed filter blocks.
+    univector<complex<ST>> premul;
+    // Scratch buffer for product of filter and input for processing by reverse DFT.
+    univector<complex<ST>> cscratch;
+    // Scratch buffers for input and output of fwd and rev DFTs.
+    univector<T> scratch1, scratch2;
+    // Overlap saved from previous block to add into current block.
     univector<T> overlap;
-    size_t position;
 };
 } // namespace CMT_ARCH_NAME
 
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -430,12 +430,12 @@ struct dct_plan : dft_plan<T>
             dft_plan<T>::execute(mirrored_dft.data(), mirrored.data(), temp, cfalse);
             for (size_t i = 0; i < halfSize; i++)
             {
-                out[i * 2 + 0] = mirrored_dft[i].re;
-                out[i * 2 + 1] = mirrored_dft[size - 1 - i].re;
+                out[i * 2 + 0] = mirrored_dft[i].real();
+                out[i * 2 + 1] = mirrored_dft[size - 1 - i].real();
             }
             if (size % 2)
             {
-                out[size - 1] = mirrored_dft[halfSize].re;
+                out[size - 1] = mirrored_dft[halfSize].real();
             }
         }
     }
diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp
@@ -36,37 +36,39 @@ namespace intrinsics
 template <typename T>
 univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2)
 {
-    const size_t size                = next_poweroftwo(src1.size() + src2.size() - 1);
-    univector<complex<T>> src1padded = src1;
-    univector<complex<T>> src2padded = src2;
-    src1padded.resize(size, 0);
-    src2padded.resize(size, 0);
-
-    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
+    using ST                          = subtype<T>;
+    const size_t size                 = next_poweroftwo(src1.size() + src2.size() - 1);
+    univector<complex<ST>> src1padded = src1;
+    univector<complex<ST>> src2padded = src2;
+    src1padded.resize(size);
+    src2padded.resize(size);
+
+    dft_plan_ptr<ST> dft = dft_cache::instance().get(ctype_t<ST>(), size);
     univector<u8> temp(dft->temp_size);
     dft->execute(src1padded, src1padded, temp);
     dft->execute(src2padded, src2padded, temp);
     src1padded = src1padded * src2padded;
     dft->execute(src1padded, src1padded, temp, true);
-    const T invsize = reciprocal<T>(size);
+    const ST invsize = reciprocal<ST>(size);
     return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
 }
 
 template <typename T>
 univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2)
 {
-    const size_t size                = next_poweroftwo(src1.size() + src2.size() - 1);
-    univector<complex<T>> src1padded = src1;
-    univector<complex<T>> src2padded = reverse(src2);
-    src1padded.resize(size, 0);
-    src2padded.resize(size, 0);
-    dft_plan_ptr<T> dft = dft_cache::instance().get(ctype_t<T>(), size);
+    using ST                          = subtype<T>;
+    const size_t size                 = next_poweroftwo(src1.size() + src2.size() - 1);
+    univector<complex<ST>> src1padded = src1;
+    univector<complex<ST>> src2padded = reverse(src2);
+    src1padded.resize(size);
+    src2padded.resize(size);
+    dft_plan_ptr<ST> dft = dft_cache::instance().get(ctype_t<ST>(), size);
     univector<u8> temp(dft->temp_size);
     dft->execute(src1padded, src1padded, temp);
     dft->execute(src2padded, src2padded, temp);
     src1padded = src1padded * src2padded;
     dft->execute(src1padded, src1padded, temp, true);
-    const T invsize = reciprocal<T>(size);
+    const ST invsize = reciprocal<ST>(size);
     return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize;
 }
 
@@ -80,21 +82,55 @@ univector<T> autocorrelate(const univector_ref<const T>& src1)
 
 } // namespace intrinsics
 
+// Create a helper template struct to handle the differences between real and complex FFT.
+template <typename T, typename ST = subtype<T>,
+          typename plan_t =
+              std::conditional_t<std::is_same<T, complex<ST>>::value, dft_plan<ST>, dft_plan_real<T>>>
+struct convolve_filter_fft
+{
+    static plan_t make(size_t size);
+    static inline void ifft(plan_t const& plan, univector<T>& out, const univector<complex<T>>& in,
+                            univector<u8>& temp);
+    static size_t csize(plan_t const& plan);
+};
+// Partial template specializations for complex and real cases:
+template <typename ST>
+struct convolve_filter_fft<complex<ST>, ST, dft_plan<ST>>
+{
+    static dft_plan<ST> make(size_t size) { return dft_plan<ST>(size); }
+    static inline void ifft(dft_plan<ST> const& plan, univector<complex<ST>>& out,
+                            const univector<complex<ST>>& in, univector<u8>& temp)
+    {
+        plan.execute(out, in, temp, ctrue);
+    }
+    static size_t csize(dft_plan<ST> const& plan) { return plan.size; }
+};
 template <typename T>
-convolve_filter<T>::convolve_filter(size_t size, size_t block_size)
-    : size(size), block_size(block_size), fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm),
-      temp(fft.temp_size), segments((size + block_size - 1) / block_size)
+struct convolve_filter_fft<T, T, dft_plan_real<T>>
+{
+    static dft_plan_real<T> make(size_t size) { return dft_plan_real<T>(size, dft_pack_format::Perm); }
+    static inline void ifft(dft_plan_real<T> const& plan, univector<T>& out, const univector<complex<T>>& in,
+                            univector<u8>& temp)
+    {
+        plan.execute(out, in, temp);
+    }
+    static size_t csize(dft_plan_real<T> const& plan) { return plan.size / 2; }
+};
 
+template <typename T>
+convolve_filter<T>::convolve_filter(size_t size_, size_t block_size_)
+    : data_size(size_), block_size(next_poweroftwo(block_size_)),
+      fft(convolve_filter_fft<T>::make(2 * block_size)), temp(fft.temp_size),
+      segments((data_size + block_size - 1) / block_size), ir_segments(segments.size()), input_position(0),
+      saved_input(block_size), premul(convolve_filter_fft<T>::csize(fft)),
+      cscratch(convolve_filter_fft<T>::csize(fft)), scratch1(fft.size), scratch2(fft.size),
+      overlap(block_size), position(0)
 {
 }
 
 template <typename T>
-convolve_filter<T>::convolve_filter(const univector_ref<const T>& data, size_t block_size)
-    : size(data.size()), block_size(next_poweroftwo(block_size)),
-      fft(2 * next_poweroftwo(block_size), dft_pack_format::Perm), temp(fft.temp_size),
-      segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
-      ir_segments((data.size() + next_poweroftwo(block_size) - 1) / next_poweroftwo(block_size)),
-      input_position(0), position(0)
+convolve_filter<T>::convolve_filter(const univector_ref<const T>& data, size_t block_size_)
+    : convolve_filter(data.size(), block_size_)
 {
     set_data(data);
 }
@@ -102,65 +138,125 @@ convolve_filter<T>::convolve_filter(const univector_ref<const T>& data, size_t b
 template <typename T>
 void convolve_filter<T>::set_data(const univector_ref<const T>& data)
 {
+    data_size = data.size();
+    segments.resize((data_size + block_size - 1) / block_size);
+    ir_segments.resize(segments.size());
     univector<T> input(fft.size);
-    const T ifftsize = reciprocal(T(fft.size));
+    const ST ifftsize = reciprocal(ST(fft.size));
     for (size_t i = 0; i < ir_segments.size(); i++)
     {
-        segments[i].resize(block_size);
-        ir_segments[i].resize(block_size, 0);
+        segments[i].resize(convolve_filter_fft<T>::csize(fft));
+        ir_segments[i].resize(convolve_filter_fft<T>::csize(fft));
         input = padded(data.slice(i * block_size, block_size));
 
         fft.execute(ir_segments[i], input, temp);
         process(ir_segments[i], ir_segments[i] * ifftsize);
     }
-    saved_input.resize(block_size, 0);
-    scratch.resize(block_size * 2);
-    premul.resize(block_size, 0);
-    cscratch.resize(block_size);
-    overlap.resize(block_size, 0);
+    reset();
 }
 
 template <typename T>
 void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size)
 {
+    // Note that the conditionals in the following algorithm are meant to
+    // reduce complexity in the common cases of either processing complete
+    // blocks (processing == block_size) or only one segment.
+
+    // For complex filtering, use CCs pack format to omit special processing in fft_multiply[_accumulate].
+    static constexpr auto fft_multiply_pack = real_fft ? dft_pack_format::Perm : dft_pack_format::CCs;
+
     size_t processed = 0;
     while (processed < size)
     {
-        const size_t processing = std::min(size - processed, block_size - input_position);
-        builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T));
+        // Calculate how many samples to process this iteration.
+        auto const processing = std::min(size - processed, block_size - input_position);
+
+        // Prepare input to forward FFT:
+        if (processing == block_size)
+        {
+            // No need to work with saved_input.
+            builtin_memcpy(scratch1.data(), input + processed, processing * sizeof(T));
+        }
+        else
+        {
+            // Append this iteration's input to the saved_input current block.
+            builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T));
+            builtin_memcpy(scratch1.data(), saved_input.data(), block_size * sizeof(T));
+        }
 
-        process(scratch, padded(saved_input));
-        fft.execute(segments[position], scratch, temp);
+        // Forward FFT saved_input block.
+        fft.execute(segments[position], scratch1, temp);
 
-        if (input_position == 0)
+        if (segments.size() == 1)
         {
-            process(premul, zeros());
-            for (size_t i = 1; i < segments.size(); i++)
+            // Just one segment/block of history.
+            // Y_k = H * X_k
+            fft_multiply(cscratch, ir_segments[0], segments[0], fft_multiply_pack);
+        }
+        else
+        {
+            // More than one segment/block of history so this is more involved.
+            if (input_position == 0)
             {
-                const size_t n = (position + i) % segments.size();
-                fft_multiply_accumulate(premul, ir_segments[i], segments[n], dft_pack_format::Perm);
+                // At the start of an input block, we premultiply the history from
+                // previous input blocks with the extended filter blocks.
+
+                // Y_(k-i,i) = H_i * X_(k-i)
+                // premul += Y_(k-i,i) for i=1,...,N
+
+                fft_multiply(premul, ir_segments[1], segments[(position + 1) % segments.size()],
+                             fft_multiply_pack);
+                for (size_t i = 2; i < segments.size(); i++)
+                {
+                    const size_t n = (position + i) % segments.size();
+                    fft_multiply_accumulate(premul, ir_segments[i], segments[n], fft_multiply_pack);
+                }
             }
+            // Y_(k,0) = H_0 * X_k
+            // Y_k = premul + Y_(k,0)
+            fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], fft_multiply_pack);
         }
-        fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], dft_pack_format::Perm);
-
-        fft.execute(scratch, cscratch, temp);
+        // y_k = IFFT( Y_k )
+        convolve_filter_fft<T>::ifft(fft, scratch2, cscratch, temp);
 
+        // z_k = y_k + overlap
         process(make_univector(output + processed, processing),
-                scratch.slice(input_position) + overlap.slice(input_position));
+                scratch2.slice(input_position) + overlap.slice(input_position));
 
         input_position += processing;
+        processed += processing;
+
+        // If a whole block was processed, prepare for next block.
         if (input_position == block_size)
         {
+            // Input block k is complete. Move to (k+1)-th input block.
             input_position = 0;
-            process(saved_input, zeros());
 
-            builtin_memcpy(overlap.data(), scratch.data() + block_size, block_size * sizeof(T));
+            // Zero out the saved_input if it will be used in the next iteration.
+            auto const remaining = size - processed;
+            if (remaining < block_size && remaining > 0)
+            {
+                process(saved_input, zeros());
+            }
+
+            builtin_memcpy(overlap.data(), scratch2.data() + block_size, block_size * sizeof(T));
 
             position = position > 0 ? position - 1 : segments.size() - 1;
         }
+    }
+}
 
-        processed += processing;
+template <typename T>
+void convolve_filter<T>::reset()
+{
+    for (auto& segment : segments)
+    {
+        process(segment, zeros());
     }
+    position = 0;
+    process(saved_input, zeros());
+    input_position = 0;
+    process(overlap, zeros());
 }
 
 namespace intrinsics
@@ -168,40 +264,68 @@ namespace intrinsics
 
 template univector<float> convolve<float>(const univector_ref<const float>&,
                                           const univector_ref<const float>&);
+template univector<complex<float>> convolve<complex<float>>(const univector_ref<const complex<float>>&,
+                                                            const univector_ref<const complex<float>>&);
 template univector<float> correlate<float>(const univector_ref<const float>&,
                                            const univector_ref<const float>&);
+template univector<complex<float>> correlate<complex<float>>(const univector_ref<const complex<float>>&,
+                                                             const univector_ref<const complex<float>>&);
 
 template univector<float> autocorrelate<float>(const univector_ref<const float>&);
+template univector<complex<float>> autocorrelate<complex<float>>(const univector_ref<const complex<float>>&);
 
 } // namespace intrinsics
 
 template convolve_filter<float>::convolve_filter(size_t, size_t);
+template convolve_filter<complex<float>>::convolve_filter(size_t, size_t);
 
 template convolve_filter<float>::convolve_filter(const univector_ref<const float>&, size_t);
+template convolve_filter<complex<float>>::convolve_filter(const univector_ref<const complex<float>>&, size_t);
 
 template void convolve_filter<float>::set_data(const univector_ref<const float>&);
+template void convolve_filter<complex<float>>::set_data(const univector_ref<const complex<float>>&);
 
 template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size);
+template void convolve_filter<complex<float>>::process_buffer(complex<float>* output,
+                                                              const complex<float>* input, size_t size);
+
+template void convolve_filter<float>::reset();
+template void convolve_filter<complex<float>>::reset();
 
 namespace intrinsics
 {
 
 template univector<double> convolve<double>(const univector_ref<const double>&,
                                             const univector_ref<const double>&);
+template univector<complex<double>> convolve<complex<double>>(const univector_ref<const complex<double>>&,
+                                                              const univector_ref<const complex<double>>&);
 template univector<double> correlate<double>(const univector_ref<const double>&,
                                              const univector_ref<const double>&);
+template univector<complex<double>> correlate<complex<double>>(const univector_ref<const complex<double>>&,
+                                                               const univector_ref<const complex<double>>&);
 
 template univector<double> autocorrelate<double>(const univector_ref<const double>&);
+template univector<complex<double>> autocorrelate<complex<double>>(
+    const univector_ref<const complex<double>>&);
 
 } // namespace intrinsics
 
 template convolve_filter<double>::convolve_filter(size_t, size_t);
+template convolve_filter<complex<double>>::convolve_filter(size_t, size_t);
 
 template convolve_filter<double>::convolve_filter(const univector_ref<const double>&, size_t);
+template convolve_filter<complex<double>>::convolve_filter(const univector_ref<const complex<double>>&,
+                                                           size_t);
 
 template void convolve_filter<double>::set_data(const univector_ref<const double>&);
+template void convolve_filter<complex<double>>::set_data(const univector_ref<const complex<double>>&);
 
 template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size);
+template void convolve_filter<complex<double>>::process_buffer(complex<double>* output,
+                                                               const complex<double>* input, size_t size);
+
+template void convolve_filter<double>::reset();
+template void convolve_filter<complex<double>>::reset();
 
 template <typename T>
 filter<T>* make_convolve_filter(const univector_ref<const T>& taps, size_t block_size)
@@ -210,7 +334,9 @@ filter<T>* make_convolve_filter(const univector_ref<const T>& taps, size_t block
 }
 
 template filter<float>* make_convolve_filter(const univector_ref<const float>&, size_t);
+template filter<complex<float>>* make_convolve_filter(const univector_ref<const complex<float>>&, size_t);
 template filter<double>* make_convolve_filter(const univector_ref<const double>&, size_t);
+template filter<complex<double>>* make_convolve_filter(const univector_ref<const complex<double>>&, size_t);
 
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
diff --git a/include/kfr/dsp/delay.hpp b/include/kfr/dsp/delay.hpp
@@ -25,42 +25,73 @@
  */
 #pragma once
 
+#include "../base/basic_expressions.hpp"
 #include "../base/expression.hpp"
 #include "../base/univector.hpp"
+#include "state_holder.hpp"
 
 namespace kfr
 {
 inline namespace CMT_ARCH_NAME
 {
 
+template <typename T, size_t samples, univector_tag Tag = samples>
+struct delay_state
+{
+    template <size_t S2 = samples, KFR_ENABLE_IF(S2 == Tag)>
+    delay_state() : data({ 0 }), cursor(0)
+    {
+    }
+
+    template <size_t S2 = samples, KFR_ENABLE_IF(S2 != Tag)>
+    delay_state() : data(samples), cursor(0)
+    {
+    }
+
+    mutable univector<T, Tag> data;
+    mutable size_t cursor;
+};
+
+template <typename T>
+struct delay_state<T, 1, 1>
+{
+    mutable T data = T(0);
+};
+
 namespace internal
 {
-template <size_t delay, typename E>
+
+template <size_t delay, typename E, bool stateless, univector_tag STag>
 struct expression_delay : expression_with_arguments<E>
 {
     using value_type = value_type_of<E>;
     using T          = value_type;
     using expression_with_arguments<E>::expression_with_arguments;
 
+    expression_delay(E&& e, const delay_state<T, delay, STag>& state)
+        : expression_with_arguments<E>(std::forward<E>(e)), state(state)
+    {
+    }
+
     template <size_t N, KFR_ENABLE_IF(N <= delay)>
     friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
                                                 vec_shape<T, N>)
     {
         vec<T, N> out;
-        size_t c = self.cursor;
-        self.data.ringbuf_read(c, out);
+        size_t c = self.state.s.cursor;
+        self.state.s.data.ringbuf_read(c, out);
         const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>());
-        self.data.ringbuf_write(self.cursor, in);
+        self.state.s.data.ringbuf_write(self.state.s.cursor, in);
         return out;
     }
     friend vec<T, 1> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
                                   vec_shape<T, 1>)
     {
         T out;
-        size_t c = self.cursor;
-        self.data.ringbuf_read(c, out);
+        size_t c = self.state.s.cursor;
+        self.state.s.data.ringbuf_read(c, out);
         const T in = self.argument_first(cinput, index, vec_shape<T, 1>())[0];
-        self.data.ringbuf_write(self.cursor, in);
+        self.state.s.data.ringbuf_write(self.state.s.cursor, in);
         return out;
     }
     template <size_t N, KFR_ENABLE_IF(N > delay)>
@@ -68,34 +99,38 @@ struct expression_delay : expression_with_arguments<E>
                                   vec_shape<T, N>)
     {
         vec<T, delay> out;
-        size_t c = self.cursor;
-        self.data.ringbuf_read(c, out);
+        size_t c = self.state.s.cursor;
+        self.state.s.data.ringbuf_read(c, out);
         const vec<T, N> in = self.argument_first(cinput, index, vec_shape<T, N>());
-        self.data.ringbuf_write(self.cursor, slice<N - delay, delay>(in));
+        self.state.s.data.ringbuf_write(self.state.s.cursor, slice<N - delay, delay>(in));
         return concat_and_slice<0, N>(out, in);
     }
 
-    mutable univector<value_type, delay> data = scalar(value_type(0));
-    mutable size_t cursor                     = 0;
+    state_holder<delay_state<T, delay, STag>, stateless> state;
 };
 
-template <typename E>
-struct expression_delay<1, E> : expression_with_arguments<E>
+template <typename E, bool stateless, univector_tag STag>
+struct expression_delay<1, E, stateless, STag> : expression_with_arguments<E>
 {
     using value_type = value_type_of<E>;
     using T          = value_type;
     using expression_with_arguments<E>::expression_with_arguments;
 
+    expression_delay(E&& e, const delay_state<T, 1, STag>& state)
+        : expression_with_arguments<E>(std::forward<E>(e)), state(state)
+    {
+    }
+
     template <size_t N>
     friend KFR_INTRINSIC vec<T, N> get_elements(const expression_delay& self, cinput_t cinput, size_t index,
                                                 vec_shape<T, N>)
     {
         const vec<T, N> in  = self.argument_first(cinput, index, vec_shape<T, N>());
-        const vec<T, N> out = insertleft(self.data, in);
-        self.data           = in[N - 1];
+        const vec<T, N> out = insertleft(self.state.s.data, in);
+        self.state.s.data   = in[N - 1];
         return out;
     }
-    mutable value_type data = value_type(0);
+    state_holder<delay_state<T, 1, STag>, stateless> state;
 };
 } // namespace internal
 
@@ -108,11 +143,30 @@ struct expression_delay<1, E> : expression_with_arguments<E>
  * auto d = delay(v, csize<4>);
  * @endcode
  */
-template <size_t samples = 1, typename E1>
-KFR_INTRINSIC internal::expression_delay<samples, E1> delay(E1&& e1, csize_t<samples> = csize_t<samples>())
+template <size_t samples = 1, typename E1, typename T = value_type_of<E1>>
+KFR_INTRINSIC internal::expression_delay<samples, E1, false, samples> delay(E1&& e1)
 {
     static_assert(samples >= 1 && samples < 1024, "");
-    return internal::expression_delay<samples, E1>(std::forward<E1>(e1));
+    return internal::expression_delay<samples, E1, false, samples>(std::forward<E1>(e1),
+                                                                   delay_state<T, samples>());
+}
+
+/**
+ * @brief Returns template expression that applies delay to the input (uses ring buffer in state)
+ * @param state delay filter state
+ * @param e1 an input expression
+ * @code
+ * univector<double, 10> v = counter();
+ * delay_state<double, 4> state;
+ * auto d = delay(state, v);
+ * @endcode
+ */
+template <size_t samples, typename T, typename E1, univector_tag STag>
+KFR_INTRINSIC internal::expression_delay<samples, E1, true, STag> delay(delay_state<T, samples, STag>& state,
+                                                                        E1&& e1)
+{
+    static_assert(STag == tag_dynamic_vector || (samples >= 1 && samples < 1024), "");
+    return internal::expression_delay<samples, E1, true, STag>(std::forward<E1>(e1), state);
 }
 } // namespace CMT_ARCH_NAME
 } // namespace kfr
diff --git a/include/kfr/dsp/fir.hpp b/include/kfr/dsp/fir.hpp
@@ -31,6 +31,7 @@
 #include "../base/reduce.hpp"
 #include "../base/univector.hpp"
 #include "../simd/vec.hpp"
+#include "state_holder.hpp"
 
 namespace kfr
 {
@@ -70,28 +71,23 @@ struct fir_state
     mutable size_t delayline_cursor;
 };
 
-namespace internal
+template <typename U, univector_tag Tag = tag_dynamic_vector>
+struct moving_sum_state
 {
-
-template <typename T, bool stateless>
-struct state_holder
+    moving_sum_state() : delayline({ 0 }), head_cursor(0), tail_cursor(1) {}
+    mutable univector<U, Tag> delayline;
+    mutable size_t head_cursor, tail_cursor;
+};
+template <typename U>
+struct moving_sum_state<U, tag_dynamic_vector>
 {
-    state_holder()                    = delete;
-    state_holder(const state_holder&) = default;
-    state_holder(state_holder&&)      = default;
-    constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {}
-    T s;
+    moving_sum_state(size_t sum_length) : delayline(sum_length, U(0)), head_cursor(0), tail_cursor(1) {}
+    mutable univector<U> delayline;
+    mutable size_t head_cursor, tail_cursor;
 };
 
-template <typename T>
-struct state_holder<T, true>
+namespace internal
 {
-    state_holder()                    = delete;
-    state_holder(const state_holder&) = default;
-    state_holder(state_holder&&)      = default;
-    constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {}
-    const T& s;
-};
 
 template <size_t tapcount, typename T, typename U, typename E1, bool stateless = false>
 struct expression_short_fir : expression_with_arguments<E1>
@@ -153,6 +149,51 @@ struct expression_fir : expression_with_arguments<E1>
     }
     state_holder<fir_state<T, U>, stateless> state;
 };
+
+template <typename U, typename E1, univector_tag STag, bool stateless = false>
+struct expression_moving_sum : expression_with_arguments<E1>
+{
+    using value_type = U;
+
+    expression_moving_sum(E1&& e1, const moving_sum_state<U, STag>& state)
+        : expression_with_arguments<E1>(std::forward<E1>(e1)), state(state)
+    {
+    }
+
+    template <size_t N>
+    KFR_INTRINSIC friend vec<U, N> get_elements(const expression_moving_sum& self, cinput_t cinput,
+                                                size_t index, vec_shape<U, N> x)
+    {
+        static_assert(N >= 1, "");
+
+        const vec<U, N> input = self.argument_first(cinput, index, x);
+
+        vec<U, N> output;
+        size_t wcursor = self.state.s.head_cursor;
+        size_t rcursor = self.state.s.tail_cursor;
+
+        // initial summation
+        self.state.s.delayline.ringbuf_write(wcursor, input[0]);
+        auto s    = sum(self.state.s.delayline);
+        output[0] = s;
+
+        CMT_LOOP_NOUNROLL
+        for (size_t i = 1; i < N; i++)
+        {
+            U nextout;
+            self.state.s.delayline.ringbuf_read(rcursor, nextout);
+            U const nextin = input[i];
+            self.state.s.delayline.ringbuf_write(wcursor, nextin);
+            s += nextin - nextout;
+            output[i] = s;
+        }
+        self.state.s.delayline.ringbuf_step(rcursor, 1);
+        self.state.s.head_cursor = wcursor;
+        self.state.s.tail_cursor = rcursor;
+        return output;
+    }
+    state_holder<moving_sum_state<U, STag>, stateless> state;
+};
 } // namespace internal
 
 /**
@@ -178,6 +219,30 @@ KFR_INTRINSIC internal::expression_fir<T, U, E1, true> fir(fir_state<T, U>& stat
 }
 
 /**
+ * @brief Returns template expression that performs moving sum on the input
+ * @param state moving sum state
+ * @param e1 an input expression
+ */
+template <size_t sum_length, typename E1>
+KFR_INTRINSIC internal::expression_moving_sum<value_type_of<E1>, E1, tag_dynamic_vector> moving_sum(E1&& e1)
+{
+    return internal::expression_moving_sum<value_type_of<E1>, E1, tag_dynamic_vector>(std::forward<E1>(e1),
+                                                                                      sum_length);
+}
+
+/**
+ * @brief Returns template expression that performs moving sum on the input
+ * @param state moving sum state
+ * @param e1 an input expression
+ */
+template <typename U, typename E1, univector_tag STag>
+KFR_INTRINSIC internal::expression_moving_sum<U, E1, STag, true> moving_sum(moving_sum_state<U, STag>& state,
+                                                                            E1&& e1)
+{
+    return internal::expression_moving_sum<U, E1, STag, true>(std::forward<E1>(e1), state);
+}
+
+/**
  * @brief Returns template expression that applies FIR filter to the input (count of coefficients must be in
  * range 2..32)
  * @param e1 an input expression
diff --git a/include/kfr/dsp/iir_design.hpp b/include/kfr/dsp/iir_design.hpp
@@ -63,7 +63,7 @@ KFR_FUNCTION zpk<T> chebyshev1(int N, identity<T> rp)
 
     univector<T> theta      = c_pi<T> * m / (2 * N);
     univector<complex<T>> p = -csinh(make_complex(mu, theta));
-    T k                     = product(-p).re;
+    T k                     = product(-p).real();
     if (N % 2 == 0)
         k = k / sqrt(1.0 + eps * eps);
     return { {}, std::move(p), k };
@@ -98,7 +98,7 @@ KFR_FUNCTION zpk<T> chebyshev2(int N, identity<T> rs)
     p = make_complex(sinh(mu) * real(p), cosh(mu) * imag(p));
     p = 1.0 / p;
 
-    T k = (product(-p) / product(-z)).re;
+    T k = (product(-p) / product(-z)).real();
 
     return { std::move(z), std::move(p), k };
 }
@@ -868,7 +868,7 @@ KFR_FUNCTION zpk<T> bilinear(const zpk<T>& filter, identity<T> fs)
     result.z = (fs2 + filter.z) / (fs2 - filter.z);
     result.p = (fs2 + filter.p) / (fs2 - filter.p);
     result.z.resize(result.p.size(), complex<T>(-1));
-    result.k = filter.k * real(product(fs2 - filter.z) / product(fs2 - filter.p));
+    result.k = filter.k * kfr::real(product(fs2 - filter.z) / product(fs2 - filter.p));
     return result;
 }
 
@@ -881,7 +881,7 @@ struct zero_pole_pairs
 template <typename T>
 KFR_FUNCTION vec<T, 3> zpk2tf_poly(const complex<T>& x, const complex<T>& y)
 {
-    return { T(1), -(x.re + y.re), x.re * y.re - x.im * y.im };
+    return { T(1), -(x.real() + y.real()), x.real() * y.real() - x.imag() * y.imag() };
 }
 
 template <typename T>
@@ -897,18 +897,18 @@ template <typename T>
 KFR_FUNCTION univector<complex<T>> cplxreal(const univector<complex<T>>& list)
 {
     univector<complex<T>> x = list;
-    std::sort(x.begin(), x.end(), [](const complex<T>& a, const complex<T>& b) { return a.re < b.re; });
+    std::sort(x.begin(), x.end(), [](const complex<T>& a, const complex<T>& b) { return a.real() < b.real(); });
     T tol                        = std::numeric_limits<T>::epsilon() * 100;
     univector<complex<T>> result = x;
     for (size_t i = result.size(); i > 1; i--)
     {
         if (!isreal(result[i - 1]) && !isreal(result[i - 2]))
         {
-            if (abs(result[i - 1].re - result[i - 2].re) < tol &&
-                abs(result[i - 1].im + result[i - 2].im) < tol)
+            if (abs(result[i - 1].real() - result[i - 2].real()) < tol &&
+                abs(result[i - 1].imag() + result[i - 2].imag()) < tol)
             {
                 result.erase(result.begin() + i - 1);
-                result[i - 2].im = abs(result[i - 2].im);
+                result[i - 2].imag(abs(result[i - 2].imag()));
             }
         }
     }
@@ -951,7 +951,7 @@ KFR_FUNCTION int countreal(const univector<complex<T>>& list)
     int nreal = 0;
     for (complex<T> c : list)
     {
-        if (c.im == 0)
+        if (c.imag() == 0)
             nreal++;
     }
     return nreal;
@@ -974,7 +974,7 @@ KFR_FUNCTION zpk<T> lp2hp_zpk(const zpk<T>& filter, identity<T> wo)
     result.z = wo / filter.z;
     result.p = wo / filter.p;
     result.z.resize(result.p.size(), T(0));
-    result.k = filter.k * real(product(-filter.z) / product(-filter.p));
+    result.k = filter.k * kfr::real(product(-filter.z) / product(-filter.p));
     return result;
 }
 
@@ -1012,7 +1012,7 @@ KFR_FUNCTION zpk<T> lp2bs_zpk(const zpk<T>& filter, identity<T> wo, identity<T> 
 
     result.z.resize(result.z.size() + filter.p.size() - filter.z.size(), complex<T>(0, +wo));
     result.z.resize(result.z.size() + filter.p.size() - filter.z.size(), complex<T>(0, -wo));
-    result.k = filter.k * real(product(-filter.z) / product(-filter.p));
+    result.k = filter.k * kfr::real(product(-filter.z) / product(-filter.p));
 
     return result;
 }
diff --git a/include/kfr/dsp/state_holder.hpp b/include/kfr/dsp/state_holder.hpp
@@ -0,0 +1,41 @@
+/** @addtogroup fir
+ *  @{
+ */
+/**
+ * KFR (http://kfrlib.com)
+ * Copyright (C) 2016  D Levin
+ * See LICENSE.txt for details
+ */
+#pragma once
+
+#include "../cident.h"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+namespace internal
+{
+
+template <typename T, bool stateless>
+struct state_holder
+{
+    state_holder()                    = delete;
+    state_holder(const state_holder&) = default;
+    state_holder(state_holder&&)      = default;
+    constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {}
+    T s;
+};
+
+template <typename T>
+struct state_holder<T, true>
+{
+    state_holder()                    = delete;
+    state_holder(const state_holder&) = default;
+    state_holder(state_holder&&)      = default;
+    constexpr state_holder(const T& state) CMT_NOEXCEPT : s(state) {}
+    const T& s;
+};
+} // namespace internal
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/math/complex_math.hpp b/include/kfr/math/complex_math.hpp
@@ -64,6 +64,12 @@ KFR_INTRINSIC vec<complex<T>, N> ccosh(const vec<complex<T>, N>& x)
 }
 
 template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> cabssqr(const vec<complex<T>, N>& x)
+{
+    const vec<T, N* 2> xx = sqr(cdecom(x));
+    return even(xx) + odd(xx);
+}
+template <typename T, size_t N>
 KFR_INTRINSIC vec<T, N> cabs(const vec<complex<T>, N>& x)
 {
     const vec<T, N* 2> xx = sqr(cdecom(x));
@@ -158,6 +164,11 @@ KFR_HANDLE_SCALAR(csqrt)
 KFR_HANDLE_SCALAR(csqr)
 
 template <typename T, size_t N>
+KFR_INTRINSIC vec<T, N> cabssqr(const vec<T, N>& a)
+{
+    return to_scalar(intrinsics::cabssqr(static_cast<vec<complex<T>, N>>(a)));
+}
+template <typename T, size_t N>
 KFR_INTRINSIC vec<T, N> cabs(const vec<T, N>& a)
 {
     return to_scalar(intrinsics::cabs(static_cast<vec<complex<T>, N>>(a)));
@@ -168,6 +179,12 @@ KFR_INTRINSIC vec<T, N> carg(const vec<T, N>& a)
     return to_scalar(intrinsics::carg(static_cast<vec<complex<T>, N>>(a)));
 }
 template <typename T1>
+KFR_INTRINSIC realtype<T1> cabssqr(const T1& a)
+{
+    using vecout = vec1<T1>;
+    return to_scalar(intrinsics::cabssqr(vecout(a)));
+}
+template <typename T1>
 KFR_INTRINSIC realtype<T1> cabs(const T1& a)
 {
     using vecout = vec1<T1>;
@@ -185,6 +202,7 @@ KFR_I_FN(csin)
 KFR_I_FN(csinh)
 KFR_I_FN(ccos)
 KFR_I_FN(ccosh)
+KFR_I_FN(cabssqr)
 KFR_I_FN(cabs)
 KFR_I_FN(carg)
 KFR_I_FN(clog)
@@ -254,6 +272,21 @@ KFR_FUNCTION internal::expression_function<fn::ccosh, E1> ccosh(E1&& x)
     return { fn::ccosh(), std::forward<E1>(x) };
 }
 
+/// @brief Returns the squared absolute value (magnitude squared) of the complex number x
+template <typename T1, KFR_ENABLE_IF(is_numeric<T1>)>
+KFR_FUNCTION realtype<T1> cabssqr(const T1& x)
+{
+    return intrinsics::cabssqr(x);
+}
+
+/// @brief Returns template expression that returns the squared absolute value (magnitude squared) of the
+/// complex number x
+template <typename E1, KFR_ENABLE_IF(is_input_expression<E1>)>
+KFR_FUNCTION internal::expression_function<fn::cabssqr, E1> cabssqr(E1&& x)
+{
+    return { fn::cabssqr(), std::forward<E1>(x) };
+}
+
 /// @brief Returns the absolute value (magnitude) of the complex number x
 template <typename T1, KFR_ENABLE_IF(is_numeric<T1>)>
 KFR_FUNCTION realtype<T1> cabs(const T1& x)
diff --git a/include/kfr/simd/complex.hpp b/include/kfr/simd/complex.hpp
@@ -60,13 +60,13 @@ struct complex
     constexpr complex(const complex&) CMT_NOEXCEPT = default;
     constexpr complex(complex&&) CMT_NOEXCEPT      = default;
     template <typename U>
-    KFR_MEM_INTRINSIC constexpr complex(const complex<U>& other) CMT_NOEXCEPT : re(static_cast<T>(other.re)),
-                                                                                im(static_cast<T>(other.im))
+    KFR_MEM_INTRINSIC constexpr complex(const complex<U>& other) CMT_NOEXCEPT : re(static_cast<T>(other.real())),
+                                                                                im(static_cast<T>(other.imag()))
     {
     }
     template <typename U>
-    KFR_MEM_INTRINSIC constexpr complex(complex<U>&& other) CMT_NOEXCEPT : re(std::move(other.re)),
-                                                                           im(std::move(other.im))
+    KFR_MEM_INTRINSIC constexpr complex(complex<U>&& other) CMT_NOEXCEPT : re(std::move(other.real())),
+                                                                           im(std::move(other.imag()))
     {
     }
 #ifdef CMT_COMPILER_GNU
@@ -80,6 +80,7 @@ struct complex
     KFR_MEM_INTRINSIC constexpr const T& imag() const CMT_NOEXCEPT { return im; }
     KFR_MEM_INTRINSIC constexpr void real(T value) CMT_NOEXCEPT { re = value; }
     KFR_MEM_INTRINSIC constexpr void imag(T value) CMT_NOEXCEPT { im = value; }
+private:
     T re;
     T im;
 };
diff --git a/tests/base_test.cpp b/tests/base_test.cpp
@@ -92,6 +92,18 @@ TEST(test_basic)
     CHECK(inrange(pack(1, 2, 3), 1, 1) == make_mask<int>(true, false, false));
 }
 
+TEST(test_gen_expj)
+{
+    kfr::univector<cbase> v = kfr::truncate(kfr::gen_expj(0.f, constants<float>::pi_s(2) * 0.1f), 1000);
+    CHECK(rms(cabs(v.slice(990) -
+                   univector<cbase>({ cbase(1., +0.00000000e+00), cbase(0.80901699, +5.87785252e-01),
+                                      cbase(0.30901699, +9.51056516e-01), cbase(-0.30901699, +9.51056516e-01),
+                                      cbase(-0.80901699, +5.87785252e-01), cbase(-1., +1.22464680e-16),
+                                      cbase(-0.80901699, -5.87785252e-01),
+                                      cbase(-0.30901699, -9.51056516e-01), cbase(0.30901699, -9.51056516e-01),
+                                      cbase(0.80901699, -5.87785252e-01) }))) < 0.00001);
+}
+
 } // namespace CMT_ARCH_NAME
 
 #ifndef KFR_NO_MAIN
diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp
@@ -32,7 +32,17 @@ TEST(test_convolve)
     CHECK(rms(c - univector<fbase>({ 0.25, 1., 2.75, 2.5, 3.75, 3.5, 1.5, -4., 7.5 })) < 0.0001);
 }
 
-TEST(test_fft_convolve)
+TEST(test_complex_convolve)
+{
+    univector<complex<fbase>, 5> a({ 1, 2, 3, 4, 5 });
+    univector<complex<fbase>, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 });
+    univector<complex<fbase>> c = convolve(a, b);
+    CHECK(c.size() == 9u);
+    CHECK(rms(cabs(c - univector<complex<fbase>>({ 0.25, 1., 2.75, 2.5, 3.75, 3.5, 1.5, -4., 7.5 }))) <
+          0.0001);
+}
+
+TEST(test_convolve_filter)
 {
     univector<fbase, 5> a({ 1, 2, 3, 4, 5 });
     univector<fbase, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 });
@@ -42,6 +52,21 @@ TEST(test_fft_convolve)
     CHECK(rms(dest - univector<fbase>({ 0.25, 1., 2.75, 2.5, 3.75 })) < 0.0001);
 }
 
+TEST(test_complex_convolve_filter)
+{
+    univector<complex<fbase>, 5> a({ 1, 2, 3, 4, 5 });
+    univector<complex<fbase>, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 });
+    univector<complex<fbase>, 5> dest;
+    convolve_filter<complex<fbase>> filter(a);
+    filter.apply(dest, b);
+    CHECK(rms(cabs(dest - univector<complex<fbase>>({ 0.25, 1., 2.75, 2.5, 3.75 }))) < 0.0001);
+    filter.apply(dest, b);
+    CHECK(rms(cabs(dest - univector<complex<fbase>>({ 0.25, 1., 2.75, 2.5, 3.75 }))) > 0.0001);
+    filter.reset();
+    filter.apply(dest, b);
+    CHECK(rms(cabs(dest - univector<complex<fbase>>({ 0.25, 1., 2.75, 2.5, 3.75 }))) < 0.0001);
+}
+
 TEST(test_correlate)
 {
     univector<fbase, 5> a({ 1, 2, 3, 4, 5 });
@@ -51,6 +76,15 @@ TEST(test_correlate)
     CHECK(rms(c - univector<fbase>({ 1.5, 1., 1.5, 2.5, 3.75, -4., 7.75, 3.5, 1.25 })) < 0.0001);
 }
 
+TEST(test_complex_correlate)
+{
+    univector<complex<fbase>, 5> a({ 1, 2, 3, 4, 5 });
+    univector<complex<fbase>, 5> b({ 0.25, 0.5, 1.0, -2.0, 1.5 });
+    univector<complex<fbase>> c = correlate(a, b);
+    CHECK(c.size() == 9u);
+    CHECK(rms(cabs(c - univector<fbase>({ 1.5, 1., 1.5, 2.5, 3.75, -4., 7.75, 3.5, 1.25 }))) < 0.0001);
+}
+
 #if defined CMT_ARCH_ARM || !defined NDEBUG
 constexpr size_t fft_stopsize = 12;
 constexpr size_t dft_stopsize = 101;
diff --git a/tests/dsp_test.cpp b/tests/dsp_test.cpp
@@ -287,6 +287,12 @@ TEST(delay)
     CHECK_EXPRESSION(delay(v1), 33, [](size_t i) { return i < 1 ? 0.f : (i - 1) + 100.f; });
 
     CHECK_EXPRESSION(delay<3>(v1), 33, [](size_t i) { return i < 3 ? 0.f : (i - 3) + 100.f; });
+
+    delay_state<float, 3> state1;
+    CHECK_EXPRESSION(delay(state1, v1), 33, [](size_t i) { return i < 3 ? 0.f : (i - 3) + 100.f; });
+
+    delay_state<float, 3, tag_dynamic_vector> state2;
+    CHECK_EXPRESSION(delay(state2, v1), 33, [](size_t i) { return i < 3 ? 0.f : (i - 3) + 100.f; });
 }
 
 TEST(fracdelay)
@@ -396,12 +402,46 @@ TEST(fir)
                           return result;
                       });
 
+                      fir_state state(taps.ref());
+
+                      CHECK_EXPRESSION(fir(state, data), 100, [&](size_t index) -> T {
+                          T result = 0;
+                          for (size_t i = 0; i < taps.size(); i++)
+                              result += data.get(index - i, 0) * taps[i];
+                          return result;
+                      });
+
                       CHECK_EXPRESSION(short_fir(data, taps), 100, [&](size_t index) -> T {
                           T result = 0;
                           for (size_t i = 0; i < taps.size(); i++)
                               result += data.get(index - i, 0) * taps[i];
                           return result;
                       });
+
+                      CHECK_EXPRESSION(moving_sum<taps.size()>(data), 100, [&](size_t index) -> T {
+                          T result = 0;
+                          for (size_t i = 0; i < taps.size(); i++)
+                              result += data.get(index - i, 0);
+                          return result;
+                      });
+
+                      moving_sum_state<T, 131> msstate1;
+
+                      CHECK_EXPRESSION(moving_sum(msstate1, data), 100, [&](size_t index) -> T {
+                          T result = 0;
+                          for (size_t i = 0; i < msstate1.delayline.size(); i++)
+                              result += data.get(index - i, 0);
+                          return result;
+                      });
+
+                      moving_sum_state<T> msstate2(133);
+
+                      CHECK_EXPRESSION(moving_sum(msstate2, data), 100, [&](size_t index) -> T {
+                          T result = 0;
+                          for (size_t i = 0; i < msstate2.delayline.size(); i++)
+                              result += data.get(index - i, 0);
+                          return result;
+                      });
                   });
 #endif
 }
@@ -444,7 +484,7 @@ inline std::complex<T> from_std(const std::complex<T>& c)
 template <typename T>
 inline std::complex<T> to_std(const kfr::complex<T>& c)
 {
-    return { c.re, c.im };
+    return { c.real(), c.imag() };
 }
 
 template <typename T>
@@ -584,9 +624,9 @@ TEST(resampler_test)
 } // namespace CMT_ARCH_NAME
 
 #ifndef KFR_NO_MAIN
-int main()
+int main(int argc, char* argv[])
 {
     println(library_version());
-    return testo::run_all("", true);
+    return testo::run_all(argc > 1 ? argv[1] : "", true);
 }
 #endif

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log \| Files \| Refs \| README

M	docs/docs/convolution.md	\|	45	+++++++++++++++++++++++++++++++++++++++++++++
M	examples/CMakeLists.txt	\|	2	++
A	examples/ccv.cpp	\|	71	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	include/kfr/base/generators.hpp	\|	41	+++++++++++++++++++++++++++++++++++++++++
M	include/kfr/dft/convolution.hpp	\|	38	+++++++++++++++++++++++++++++---------
M	include/kfr/dft/fft.hpp	\|	6	+++---
M	include/kfr/dft/impl/convolution-impl.cpp	\|	222	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M	include/kfr/dsp/delay.hpp	\|	94	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M	include/kfr/dsp/fir.hpp	\|	99	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	include/kfr/dsp/iir_design.hpp	\|	22	+++++++++++-----------
A	include/kfr/dsp/state_holder.hpp	\|	41	+++++++++++++++++++++++++++++++++++++++++
M	include/kfr/math/complex_math.hpp	\|	33	+++++++++++++++++++++++++++++++++
M	include/kfr/simd/complex.hpp	\|	9	+++++----
M	tests/base_test.cpp	\|	12	++++++++++++
M	tests/dft_test.cpp	\|	36	+++++++++++++++++++++++++++++++++++-
M	tests/dsp_test.cpp	\|	46	+++++++++++++++++++++++++++++++++++++++++++---