From e244e31614a23e7f02eed1358ba91abf78e1b490 Mon Sep 17 00:00:00 2001
From: Geraint <luffgd@gmail.com>
Date: Fri, 25 Nov 2022 13:45:55 +0000
Subject: [PATCH] Initial commit: pitch-shift works OK, time-stretch could be
 better

---
 dsp/LICENSE.txt       |  21 ++
 dsp/README.md         |  40 +++
 dsp/common.h          |  40 +++
 dsp/curves.h          | 234 ++++++++++++++
 dsp/delay.h           | 716 ++++++++++++++++++++++++++++++++++++++++++
 dsp/envelopes.h       | 523 ++++++++++++++++++++++++++++++
 dsp/fft.h             | 520 ++++++++++++++++++++++++++++++
 dsp/filters.h         | 434 +++++++++++++++++++++++++
 dsp/mix.h             | 218 +++++++++++++
 dsp/perf.h            |  43 +++
 dsp/spectral.h        | 455 +++++++++++++++++++++++++++
 dsp/windows.h         | 185 +++++++++++
 signalsmith-stretch.h | 412 ++++++++++++++++++++++++
 13 files changed, 3841 insertions(+)
 create mode 100644 dsp/LICENSE.txt
 create mode 100644 dsp/README.md
 create mode 100644 dsp/common.h
 create mode 100644 dsp/curves.h
 create mode 100644 dsp/delay.h
 create mode 100644 dsp/envelopes.h
 create mode 100644 dsp/fft.h
 create mode 100644 dsp/filters.h
 create mode 100644 dsp/mix.h
 create mode 100644 dsp/perf.h
 create mode 100644 dsp/spectral.h
 create mode 100644 dsp/windows.h
 create mode 100644 signalsmith-stretch.h
diff --git a/dsp/LICENSE.txt b/dsp/LICENSE.txt
new file mode 100644
index 0000000..4e91ac9
--- /dev/null
+++ b/dsp/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Geraint Luff / Signalsmith Audio Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/dsp/README.md b/dsp/README.md
new file mode 100644
index 0000000..d0c2840
--- /dev/null
+++ b/dsp/README.md
@@ -0,0 +1,40 @@
+# Signalsmith Audio's DSP Library
+
+A C++11 header-only library, providing classes/templates for (mostly audio) signal-processing tasks.
+
+More detail is in the [main project page](https://signalsmith-audio.co.uk/code/dsp/), and the [Doxygen docs](https://signalsmith-audio.co.uk/code/dsp/html/modules.html).
+
+## Basic use
+
+```
+git clone https://signalsmith-audio.co.uk/code/dsp.git
+```
+
+Just include the header file(s) you need, and start using classes:
+
+```cpp
+#include "dsp/delay.h"
+
+using Delay = signalsmith::delay::Delay<float>;
+Delay delayLine(1024);
+```
+
+You can add a compile-time version-check to make sure you have a compatible version of the library:
+```cpp
+#include "dsp/envelopes.h"
+SIGNALSMITH_DSP_VERSION_CHECK(1, 3, 3)
+```
+
+### Development / contributing
+
+Tests (and source-scripts for the above docs) are available in a separate repo:
+
+```
+git clone https://signalsmith-audio.co.uk/code/dsp-doc.git
+```
+
+The goal (where possible) is to measure/test the actual audio characteristics of the tools (e.g. frequency responses and aliasing levels).
+
+### License
+
+This code is [MIT licensed](LICENSE.txt).  If you'd prefer something else, get in touch.
diff --git a/dsp/common.h b/dsp/common.h
new file mode 100644
index 0000000..a1963e6
--- /dev/null
+++ b/dsp/common.h
@@ -0,0 +1,40 @@
+#ifndef SIGNALSMITH_DSP_COMMON_H
+#define SIGNALSMITH_DSP_COMMON_H
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846264338327950288
+#endif
+
+namespace signalsmith {
+	/**	@defgroup Common Common
+		@brief Definitions and helper classes used by the rest of the library
+		
+		@{
+		@file
+	*/
+
+#define SIGNALSMITH_DSP_VERSION_MAJOR 1
+#define SIGNALSMITH_DSP_VERSION_MINOR 3
+#define SIGNALSMITH_DSP_VERSION_PATCH 3
+#define SIGNALSMITH_DSP_VERSION_STRING "1.3.3"
+
+	/** Version compatability check.
+	\code{.cpp}
+		static_assert(signalsmith::version(1, 0, 0), "version check");
+	\endcode
+	... or use the equivalent `SIGNALSMITH_DSP_VERSION_CHECK`.
+	Major versions are not compatible with each other.  Minor and patch versions are backwards-compatible.
+	*/
+	constexpr bool versionCheck(int major, int minor, int patch=0) {
+		return major == SIGNALSMITH_DSP_VERSION_MAJOR
+			&& (SIGNALSMITH_DSP_VERSION_MINOR > minor
+				|| (SIGNALSMITH_DSP_VERSION_MINOR == minor && SIGNALSMITH_DSP_VERSION_PATCH >= patch));
+	}
+
+/// Check the library version is compatible (semver).
+#define SIGNALSMITH_DSP_VERSION_CHECK(major, minor, patch) \
+	static_assert(::signalsmith::versionCheck(major, minor, patch), "signalsmith library version is " SIGNALSMITH_DSP_VERSION_STRING);
+
+/** @} */
+} // signalsmith::
+#endif // include guard
diff --git a/dsp/curves.h b/dsp/curves.h
new file mode 100644
index 0000000..934dd29
--- /dev/null
+++ b/dsp/curves.h
@@ -0,0 +1,234 @@
+#ifndef SIGNALSMITH_DSP_CURVES_H
+#define SIGNALSMITH_DSP_CURVES_H
+
+#include "./common.h"
+
+#include <vector>
+#include <algorithm> // std::stable_sort
+
+namespace signalsmith {
+namespace curves {
+	/**	@defgroup Curves Curves
+		@brief User-defined mapping functions
+		
+		@{
+		@file
+	*/
+
+	/// Linear map for real values.
+	template<typename Sample=double>
+	class Linear {
+		Sample a1, a0;
+	public:
+		Linear() : Linear(0, 1) {}
+		Linear(Sample a0, Sample a1) : a1(a1), a0(a0) {}
+		/// Construct by from/to value pairs
+		Linear(Sample x0, Sample x1, Sample y0, Sample y1) : a1((x0 == x1) ? 0 : (y1 - y0)/(x1 - x0)), a0(y0 - x0*a1) {}
+
+		Sample operator ()(Sample x) const {
+			return a0 + x*a1;
+		}
+		
+		/// Returns the inverse map (with some numerical error)
+		Linear inverse() const {
+			Sample invA1 = 1/a1;
+			return Linear(-a0*invA1, invA1);
+		}
+	};
+
+	/// A real-valued cubic curve.  It has a "start" point where accuracy is highest.
+	template<typename Sample=double>
+	class Cubic {
+		Sample xStart, a0, a1, a2, a3;
+		
+		// Only use with y0 != y1
+		static inline Sample gradient(Sample x0, Sample x1, Sample y0, Sample y1) {
+			return (y1 - y0)/(x1 - x0);
+		}
+		// Ensure a gradient produces monotonic segments
+		static inline void ensureMonotonic(Sample &curveGrad, Sample gradA, Sample gradB) {
+			if ((gradA <= 0 && gradB >= 0) || (gradA >= 0 && gradB <= 0)) {
+				curveGrad = 0; // point is a local minimum/maximum
+			} else {
+				if (std::abs(curveGrad) > std::abs(gradA*3)) {
+					curveGrad = gradA*3;
+				}
+				if (std::abs(curveGrad) > std::abs(gradB*3)) {
+					curveGrad = gradB*3;
+				}
+			}
+		}
+		// When we have duplicate x-values (either side) make up a gradient
+		static inline void chooseGradient(Sample &curveGrad, Sample grad1, Sample curveGradOther, Sample y0, Sample y1, bool monotonic) {
+			curveGrad = 2*grad1 - curveGradOther;
+			if (y0 != y1 && (y1 > y0) != (grad1 >= 0)) { // not duplicate y, but a local min/max
+				curveGrad = 0;
+			} else if (monotonic) {
+				if (grad1 >= 0) {
+					curveGrad = std::max<Sample>(0, curveGrad);
+				} else {
+					curveGrad = std::min<Sample>(0, curveGrad);
+				}
+			}
+		}
+	public:
+		Cubic() : Cubic(0, 0, 0, 0, 0) {}
+		Cubic(Sample xStart, Sample a0, Sample a1, Sample a2, Sample a3) : xStart(xStart), a0(a0), a1(a1), a2(a2), a3(a3) {}
+		
+		Sample operator ()(Sample x) const {
+			x -= xStart;
+			return a0 + x*(a1 + x*(a2 + x*a3));
+		}
+		/// The reference x-value, used as the centre of the cubic expansion
+		Sample start() const {
+			return xStart;
+		}
+		/// Differentiate
+		Cubic dx() const {
+			return {xStart, a1, 2*a2, 3*a3, 0};
+		}
+		
+		/// Cubic segment based on start/end values and gradients
+		static Cubic hermite(Sample x0, Sample x1, Sample y0, Sample y1, Sample g0, Sample g1) {
+			Sample xScale = 1/(x1 - x0);
+			return {
+				x0, y0, g0,
+				(3*(y1 - y0)*xScale - 2*g0 - g1)*xScale,
+				(2*(y0 - y1)*xScale + g0 + g1)*(xScale*xScale)
+			};
+		}
+		
+		/** Cubic segment (valid between `x1` and `x2`), which is smooth when applied to an adjacent set of points.
+		If `x0 == x1` or `x2 == x3` it will choose a gradient which continues in a quadratic curve, or 0 if the point is a local minimum/maximum.
+		*/
+		static Cubic smooth(Sample x0, Sample x1, Sample x2, Sample x3, Sample y0, Sample y1, Sample y2, Sample y3, bool monotonic=false) {
+			if (x1 == x2) return {0, y1, 0, 0, 0}; // zero-width segment, just return constant
+
+			Sample grad1 = gradient(x1, x2, y1, y2);
+			Sample curveGrad1 = grad1;
+			if (x0 != x1) { // we have a defined x0-x1 gradient
+				Sample grad0 = gradient(x0, x1, y0, y1);
+				curveGrad1 = (grad0 + grad1)*Sample(0.5);
+				if (monotonic) ensureMonotonic(curveGrad1, grad0, grad1);
+			} else if (y0 != y1) {
+				if ((y1 > y0) != (grad1 >= 0)) curveGrad1 = 0; // set to 0 if it's a min/max
+			}
+			Sample curveGrad2;
+			if (x2 != x3) { // we have a defined x1-x2 gradient
+				Sample grad2 = gradient(x2, x3, y2, y3);
+				curveGrad2 = (grad1 + grad2)*Sample(0.5);
+				if (monotonic) ensureMonotonic(curveGrad2, grad1, grad2);
+
+				if (x0 == x1) { // If the other gradient isn't defined, make one up
+					chooseGradient(curveGrad1, grad1, curveGrad2, y0, y1, monotonic);
+				}
+			} else {
+				chooseGradient(curveGrad2, grad1, curveGrad1, y2, y3, monotonic);
+			}
+			return hermite(x1, x2, y1, y2, curveGrad1, curveGrad2);
+		}
+	};
+	
+	/** Smooth interpolation (optionally monotonic) between points, using cubic segments.
+	\diagram{cubic-segments-example.svg,Example curve including a repeated point and an instantaneous jump.  The curve is flat beyond the first/last points.}
+	To produce a sharp corner, use a repeated point. The gradient is flat at the edges, unless you use repeated points at the start/end.*/
+	template<typename Sample=double>
+	class CubicSegmentCurve {
+		struct Point {
+			Sample x, y;
+			bool operator <(const Point &other) const {
+				return x < other.x;
+			}
+		};
+		std::vector<Point> points;
+		Point first{0, 0}, last{0, 0};
+		
+		std::vector<Cubic<Sample>> _segments{1};
+	public:
+		/// Clear existing points and segments
+		void clear() {
+			points.resize(0);
+			_segments.resize(0);
+			first = last = {0, 0};
+		}
+	
+		/// Add a new point, but does not recalculate the segments.  `corner` just writes the point twice, for convenience.
+		CubicSegmentCurve & add(Sample x, Sample y, bool corner=false) {
+			points.push_back({x, y});
+			if (corner) points.push_back({x, y});
+			return *this;
+		}
+		
+		/// Recalculates the segments.
+		void update(bool monotonic=false) {
+			if (points.empty()) add(0, 0);
+			std::stable_sort(points.begin(), points.end()); // Ensure ascending order
+			_segments.resize(0);
+			first = points[0];
+			last = points.back();
+			for (size_t i = 1; i < points.size(); ++i) {
+				Point p1 = points[i - 1];
+				Point p2 = points[i];
+				if (p1.x != p2.x) {
+					Point p0 = (i > 1) ? points[i - 2] : Point{p1.x, p2.y};
+					Point p3 = (i + 1 < points.size()) ? points[i + 1] : Point{p2.x, p1.y};
+					_segments.push_back(Segment::smooth(p0.x, p1.x, p2.x, p3.x, p0.y, p1.y, p2.y, p3.y, monotonic));
+				}
+			}
+		}
+		
+		/// Reads a value out from the curve.
+		Sample operator ()(Sample x) const {
+			if (x <= first.x) return first.y;
+			if (x >= last.x) return last.y;
+			size_t index = 1;
+			while (index < _segments.size() && _segments[index].start() <= x) {
+				++index;
+			}
+			return _segments[index - 1](x);
+		}
+		
+		using Segment = Cubic<Sample>;
+		const std::vector<Segment> & segments() const {
+			return _segments;
+		}
+	};
+	
+	/** A warped-range map, based on 1/x
+		\diagram{curves-reciprocal-example.svg}*/
+	template<typename Sample=double>
+	class Reciprocal {
+		Sample a, b, c, d; // (a + bx)/(c + dx)
+		Reciprocal(Sample a, Sample b, Sample c, Sample d) : a(a), b(b), c(c), d(d) {}
+	public:
+		Reciprocal() : Reciprocal(0, 0.5, 1) {}
+		/// If no x-range given, default to the unit range
+		Reciprocal(Sample y0, Sample y1, Sample y2) : Reciprocal(0, 0.5, 1, y0, y1, y2) {}
+		Reciprocal(Sample x0, Sample x1, Sample x2, Sample y0, Sample y1, Sample y2) {
+			Sample kx = (x1 - x0)/(x2 - x1);
+			Sample ky = (y1 - y0)/(y2 - y1);
+			a = (kx*x2)*y0 - (ky*x0)*y2;
+			b = ky*y2 - kx*y0;
+			c = kx*x2 - ky*x0;
+			d = ky - kx;
+		}
+
+		Sample operator ()(double x) const {
+			return (a + b*x)/(c + d*x);
+		}
+		Reciprocal inverse() const {
+			return Reciprocal(-a, c, b, -d);
+		}
+		Sample inverse(Sample y) const {
+			return (c*y - a)/(b - d*y);
+		}
+		
+		/// Combine two `Reciprocal`s together in sequence
+		Reciprocal then(const Reciprocal &other) const {
+			return Reciprocal(other.a*c + other.b*a, other.a*d + other.b*b, other.c*c + other.d*a, other.c*d + other.d*b);
+		}
+	};
+
+/** @} */
+}} // namespace
+#endif // include guard
diff --git a/dsp/delay.h b/dsp/delay.h
new file mode 100644
index 0000000..f52113f
--- /dev/null
+++ b/dsp/delay.h
@@ -0,0 +1,716 @@
+#ifndef SIGNALSMITH_DSP_DELAY_H
+#define SIGNALSMITH_DSP_DELAY_H
+
+#include "./common.h"
+
+#include <vector>
+#include <array>
+#include <cmath> // for std::ceil()
+#include <type_traits>
+
+#include <complex>
+#include "./fft.h"
+#include "./windows.h"
+
+namespace signalsmith {
+namespace delay {
+	/**	@defgroup Delay Delay utilities
+		@brief Standalone templated classes for delays
+		
+		You can set up a `Buffer` or `MultiBuffer`, and get interpolated samples using a `Reader` (separately on each channel in the multi-channel case) - or you can use `Delay`/`MultiDelay` which include their own buffers.
+
+		Interpolation quality is chosen using a template class, from @ref Interpolators.
+
+		@{
+		@file
+	*/
+
+	/** @brief Single-channel delay buffer
+ 
+		Access is used with `buffer[]`, relative to the internal read/write position ("head").  This head is moved using `++buffer` (or `buffer += n`), such that `buffer[1] == (buffer + 1)[0]` in a similar way iterators/pointers.
+		
+		Operations like `buffer - 10` or `buffer++` return a View, which holds a fixed position in the buffer (based on the read/write position at the time).
+		
+		The capacity includes both positive and negative indices.  For example, a capacity of 100 would support using any of the ranges:
+		
+		* `buffer[-99]` to buffer[0]`
+		* `buffer[-50]` to buffer[49]`
+		* `buffer[0]` to buffer[99]`
+
+		Although buffers are usually used with historical samples accessed using negative indices e.g. `buffer[-10]`, you could equally use it flipped around (moving the head backwards through the buffer using `--buffer`).
+	*/
+	template<typename Sample>
+	class Buffer {
+		unsigned bufferIndex;
+		unsigned bufferMask;
+		std::vector<Sample> buffer;
+	public:
+		Buffer(int minCapacity=0) {
+			resize(minCapacity);
+		}
+		// We shouldn't accidentally copy a delay buffer
+		Buffer(const Buffer &other) = delete;
+		Buffer & operator =(const Buffer &other) = delete;
+		// But moving one is fine
+		Buffer(Buffer &&other) = default;
+		Buffer & operator =(Buffer &&other) = default;
+
+		void resize(int minCapacity, Sample value=Sample()) {
+			int bufferLength = 1;
+			while (bufferLength < minCapacity) bufferLength *= 2;
+			buffer.assign(bufferLength, value);
+			bufferMask = unsigned(bufferLength - 1);
+			bufferIndex = 0;
+		}
+		void reset(Sample value=Sample()) {
+			buffer.assign(buffer.size(), value);
+		}
+
+		/// Holds a view for a particular position in the buffer
+		template<bool isConst>
+		class View {
+			using CBuffer = typename std::conditional<isConst, const Buffer, Buffer>::type;
+			using CSample = typename std::conditional<isConst, const Sample, Sample>::type;
+			CBuffer *buffer = nullptr;
+			unsigned bufferIndex = 0;
+		public:
+			View(CBuffer &buffer, int offset=0) : buffer(&buffer), bufferIndex(buffer.bufferIndex + (unsigned)offset) {}
+			View(const View &other, int offset=0) : buffer(other.buffer), bufferIndex(other.bufferIndex + (unsigned)offset) {}
+			View & operator =(const View &other) {
+				buffer = other.buffer;
+				bufferIndex = other.bufferIndex;
+				return *this;
+			}
+			
+			CSample & operator[](int offset) {
+				return buffer->buffer[(bufferIndex + (unsigned)offset)&buffer->bufferMask];
+			}
+			const Sample & operator[](int offset) const {
+				return buffer->buffer[(bufferIndex + (unsigned)offset)&buffer->bufferMask];
+			}
+
+			/// Write data into the buffer
+			template<typename Data>
+			void write(Data &&data, int length) {
+				for (int i = 0; i < length; ++i) {
+					(*this)[i] = data[i];
+				}
+			}
+			/// Read data out from the buffer
+			template<typename Data>
+			void read(int length, Data &&data) const {
+				for (int i = 0; i < length; ++i) {
+					data[i] = (*this)[i];
+				}
+			}
+
+			View operator +(int offset) const {
+				return View(*this, offset);
+			}
+			View operator -(int offset) const {
+				return View(*this, -offset);
+			}
+		};
+		using MutableView = View<false>;
+		using ConstView = View<true>;
+		
+		MutableView view(int offset=0) {
+			return MutableView(*this, offset);
+		}
+		ConstView view(int offset=0) const {
+			return ConstView(*this, offset);
+		}
+		ConstView constView(int offset=0) const {
+			return ConstView(*this, offset);
+		}
+
+		Sample & operator[](int offset) {
+			return buffer[(bufferIndex + (unsigned)offset)&bufferMask];
+		}
+		const Sample & operator[](int offset) const {
+			return buffer[(bufferIndex + (unsigned)offset)&bufferMask];
+		}
+
+		/// Write data into the buffer
+		template<typename Data>
+		void write(Data &&data, int length) {
+			for (int i = 0; i < length; ++i) {
+				(*this)[i] = data[i];
+			}
+		}
+		/// Read data out from the buffer
+		template<typename Data>
+		void read(int length, Data &&data) const {
+			for (int i = 0; i < length; ++i) {
+				data[i] = (*this)[i];
+			}
+		}
+		
+		Buffer & operator ++() {
+			++bufferIndex;
+			return *this;
+		}
+		Buffer & operator +=(int i) {
+			bufferIndex += (unsigned)i;
+			return *this;
+		}
+		Buffer & operator --() {
+			--bufferIndex;
+			return *this;
+		}
+		Buffer & operator -=(int i) {
+			bufferIndex -= (unsigned)i;
+			return *this;
+		}
+
+		MutableView operator ++(int) {
+			MutableView view(*this);
+			++bufferIndex;
+			return view;
+		}
+		MutableView operator +(int i) {
+			return MutableView(*this, i);
+		}
+		ConstView operator +(int i) const {
+			return ConstView(*this, i);
+		}
+		MutableView operator --(int) {
+			MutableView view(*this);
+			--bufferIndex;
+			return view;
+		}
+		MutableView operator -(int i) {
+			return MutableView(*this, -i);
+		}
+		ConstView operator -(int i) const {
+			return ConstView(*this, -i);
+		}
+	};
+
+	/** @brief Multi-channel delay buffer
+
+		This behaves similarly to the single-channel `Buffer`, with the following differences:
+		
+		* `buffer[c]` returns a view for a single channel, which behaves like the single-channel `Buffer::View`.
+		* The constructor and `.resize()` take an additional first `channel` argument.
+	*/
+	template<typename Sample>
+	class MultiBuffer {
+		int channels, stride;
+		Buffer<Sample> buffer;
+	public:
+		using ConstChannel = typename Buffer<Sample>::ConstView;
+		using MutableChannel = typename Buffer<Sample>::MutableView;
+
+		MultiBuffer(int channels=0, int capacity=0) : channels(channels), stride(capacity), buffer(channels*capacity) {}
+
+		void resize(int nChannels, int capacity, Sample value=Sample()) {
+			channels = nChannels;
+			stride = capacity;
+			buffer.resize(channels*capacity, value);
+		}
+		void reset(Sample value=Sample()) {
+			buffer.reset(value);
+		}
+
+		/// A reference-like multi-channel result for a particular sample index
+		template<bool isConst>
+		class Stride {
+			using CChannel = typename std::conditional<isConst, ConstChannel, MutableChannel>::type;
+			using CSample = typename std::conditional<isConst, const Sample, Sample>::type;
+			CChannel view;
+			int channels, stride;
+		public:
+			Stride(CChannel view, int channels, int stride) : view(view), channels(channels), stride(stride) {}
+			Stride(const Stride &other) : view(other.view), channels(other.channels), stride(other.stride) {}
+			
+			CSample & operator[](int channel) {
+				return view[channel*stride];
+			}
+			const Sample & operator[](int channel) const {
+				return view[channel*stride];
+			}
+
+			/// Reads from the buffer into a multi-channel result
+			template<class Data>
+			void get(Data &&result) const {
+				for (int c = 0; c < channels; ++c) {
+					result[c] = view[c*stride];
+				}
+			}
+			/// Writes from multi-channel data into the buffer
+			template<class Data>
+			void set(Data &&data) {
+				for (int c = 0; c < channels; ++c) {
+					view[c*stride] = data[c];
+				}
+			}
+			template<class Data>
+			Stride & operator =(const Data &data) {
+				set(data);
+				return *this;
+			}
+			Stride & operator =(const Stride &data) {
+				set(data);
+				return *this;
+			}
+		};
+		
+		Stride<false> at(int offset) {
+			return {buffer.view(offset), channels, stride};
+		}
+		Stride<true> at(int offset) const {
+			return {buffer.view(offset), channels, stride};
+		}
+
+		/// Holds a particular position in the buffer
+		template<bool isConst>
+		class View {
+			using CChannel = typename std::conditional<isConst, ConstChannel, MutableChannel>::type;
+			CChannel view;
+			int channels, stride;
+		public:
+			View(CChannel view, int channels, int stride) : view(view), channels(channels), stride(stride) {}
+			
+			CChannel operator[](int channel) {
+				return view + channel*stride;
+			}
+			ConstChannel operator[](int channel) const {
+				return view + channel*stride;
+			}
+
+			Stride<isConst> at(int offset) {
+				return {view + offset, channels, stride};
+			}
+			Stride<true> at(int offset) const {
+				return {view + offset, channels, stride};
+			}
+		};
+		using MutableView = View<false>;
+		using ConstView = View<true>;
+
+		MutableView view(int offset=0) {
+			return MutableView(buffer.view(offset), channels, stride);
+		}
+		ConstView view(int offset=0) const {
+			return ConstView(buffer.view(offset), channels, stride);
+		}
+		ConstView constView(int offset=0) const {
+			return ConstView(buffer.view(offset), channels, stride);
+		}
+
+		MutableChannel operator[](int channel) {
+			return buffer + channel*stride;
+		}
+		ConstChannel operator[](int channel) const {
+			return buffer + channel*stride;
+		}
+		
+		MultiBuffer & operator ++() {
+			++buffer;
+			return *this;
+		}
+		MultiBuffer & operator +=(int i) {
+			buffer += i;
+			return *this;
+		}
+		MutableView operator ++(int) {
+			return MutableView(buffer++, channels, stride);
+		}
+		MutableView operator +(int i) {
+			return MutableView(buffer + i, channels, stride);
+		}
+		ConstView operator +(int i) const {
+			return ConstView(buffer + i, channels, stride);
+		}
+		MultiBuffer & operator --() {
+			--buffer;
+			return *this;
+		}
+		MultiBuffer & operator -=(int i) {
+			buffer -= i;
+			return *this;
+		}
+		MutableView operator --(int) {
+			return MutableView(buffer--, channels, stride);
+		}
+		MutableView operator -(int i) {
+			return MutableView(buffer - i, channels, stride);
+		}
+		ConstView operator -(int i) const {
+			return ConstView(buffer - i, channels, stride);
+		}
+	};
+	
+	/** \defgroup Interpolators Interpolators
+		\ingroup Delay
+		@{ */
+	/// Nearest-neighbour interpolator
+	/// \diagram{delay-random-access-nearest.svg,aliasing and maximum amplitude/delay errors for different input frequencies}
+	template<typename Sample>
+	struct InterpolatorNearest {
+		static constexpr int inputLength = 1;
+		static constexpr Sample latency = -0.5; // Because we're truncating, which rounds down too often
+	
+		template<class Data>
+		static Sample fractional(const Data &data, Sample) {
+			return data[0];
+		}
+	};
+	/// Linear interpolator
+	/// \diagram{delay-random-access-linear.svg,aliasing and maximum amplitude/delay errors for different input frequencies}
+	template<typename Sample>
+	struct InterpolatorLinear {
+		static constexpr int inputLength = 2;
+		static constexpr int latency = 0;
+	
+		template<class Data>
+		static Sample fractional(const Data &data, Sample fractional) {
+			Sample a = data[0], b = data[1];
+			return a + fractional*(b - a);
+		}
+	};
+	/// Spline cubic interpolator
+	/// \diagram{delay-random-access-cubic.svg,aliasing and maximum amplitude/delay errors for different input frequencies}
+	template<typename Sample>
+	struct InterpolatorCubic {
+		static constexpr int inputLength = 4;
+		static constexpr int latency = 1;
+	
+		template<class Data>
+		static Sample fractional(const Data &data, Sample fractional) {
+			// Cubic interpolation
+			Sample a = data[0], b = data[1], c = data[2], d = data[3];
+			Sample cbDiff = c - b;
+			Sample k1 = (c - a)*0.5;
+			Sample k3 = k1 + (d - b)*0.5 - cbDiff*2;
+			Sample k2 = cbDiff - k3 - k1;
+			return b + fractional*(k1 + fractional*(k2 + fractional*k3)); // 16 ops total, not including the indexing
+		}
+	};
+
+	// Efficient Algorithms and Structures for Fractional Delay Filtering Based on Lagrange Interpolation
+	// Franck 2009 https://www.aes.org/e-lib/browse.cfm?elib=14647
+	namespace _franck_impl {
+		template<typename Sample, int n, int low, int high>
+		struct ProductRange {
+			using Array = std::array<Sample, (n + 1)>;
+			static constexpr int mid = (low + high)/2;
+			using Left = ProductRange<Sample, n, low, mid>;
+			using Right = ProductRange<Sample, n, mid + 1, high>;
+
+			Left left;
+			Right right;
+
+			const Sample total;
+			ProductRange(Sample x) : left(x), right(x), total(left.total*right.total) {}
+
+			template<class Data>
+			Sample calculateResult(Sample extraFactor, const Data &data, const Array &invFactors) {
+				return left.calculateResult(extraFactor*right.total, data, invFactors)
+					+ right.calculateResult(extraFactor*left.total, data, invFactors);
+			}
+		};
+		template<typename Sample, int n, int index>
+		struct ProductRange<Sample, n, index, index> {
+			using Array = std::array<Sample, (n + 1)>;
+
+			const Sample total;
+			ProductRange(Sample x) : total(x - index) {}
+
+			template<class Data>
+			Sample calculateResult(Sample extraFactor, const Data &data, const Array &invFactors) {
+				return extraFactor*data[index]*invFactors[index];
+			}
+		};
+	}
+	/** Fixed-order Lagrange interpolation.
+	\diagram{interpolator-LagrangeN.svg,aliasing and amplitude/delay errors for different sizes}
+	*/
+	template<typename Sample, int n>
+	struct InterpolatorLagrangeN {
+		static constexpr int inputLength = n + 1;
+		static constexpr int latency = (n - 1)/2;
+
+		using Array = std::array<Sample, (n + 1)>;
+		Array invDivisors;
+
+		InterpolatorLagrangeN() {
+			for (int j = 0; j <= n; ++j) {
+				double divisor = 1;
+				for (int k = 0; k < j; ++k) divisor *= (j - k);
+				for (int k = j + 1; k <= n; ++k) divisor *= (j - k);
+				invDivisors[j] = 1/divisor;
+			}
+		}
+
+		template<class Data>
+		Sample fractional(const Data &data, Sample fractional) const {
+			constexpr int mid = n/2;
+			using Left = _franck_impl::ProductRange<Sample, n, 0, mid>;
+			using Right = _franck_impl::ProductRange<Sample, n, mid + 1, n>;
+
+			Sample x = fractional + latency;
+
+			Left left(x);
+			Right right(x);
+
+			return left.calculateResult(right.total, data, invDivisors) + right.calculateResult(left.total, data, invDivisors);
+		}
+	};
+	template<typename Sample>
+	using InterpolatorLagrange3 = InterpolatorLagrangeN<Sample, 3>;
+	template<typename Sample>
+	using InterpolatorLagrange7 = InterpolatorLagrangeN<Sample, 7>;
+	template<typename Sample>
+	using InterpolatorLagrange19 = InterpolatorLagrangeN<Sample, 19>;
+
+	/** Fixed-size Kaiser-windowed sinc interpolation.
+	\diagram{interpolator-KaiserSincN.svg,aliasing and amplitude/delay errors for different sizes}
+	If `minimumPhase` is enabled, a minimum-phase version of the kernel is used:
+	\diagram{interpolator-KaiserSincN-min.svg,aliasing and amplitude/delay errors for minimum-phase mode}
+	*/
+	template<typename Sample, int n, bool minimumPhase=false>
+	struct InterpolatorKaiserSincN {
+		static constexpr int inputLength = n;
+		static constexpr Sample latency = minimumPhase ? 0 : (n*Sample(0.5) - 1);
+
+		int subSampleSteps;
+		std::vector<Sample> coefficients;
+		
+		InterpolatorKaiserSincN() : InterpolatorKaiserSincN(0.5 - 0.45/std::sqrt(n)) {}
+		InterpolatorKaiserSincN(double passFreq) : InterpolatorKaiserSincN(passFreq, 1 - passFreq) {}
+		InterpolatorKaiserSincN(double passFreq, double stopFreq) {
+			subSampleSteps = 2*n; // Heuristic again.  Really it depends on the bandwidth as well.
+			double kaiserBandwidth = (stopFreq - passFreq)*(n + 1.0/subSampleSteps);
+			kaiserBandwidth += 1.25/kaiserBandwidth; // We want to place the first zero, but (because using this to window a sinc essentially integrates it in the freq-domain), our ripples (and therefore zeroes) are out of phase.  This is a heuristic fix.
+			
+			double centreIndex = n*subSampleSteps*0.5, scaleFactor = 1.0/subSampleSteps;
+			std::vector<Sample> windowedSinc(subSampleSteps*n + 1);
+			
+			::signalsmith::windows::Kaiser::withBandwidth(kaiserBandwidth, false).fill(windowedSinc, windowedSinc.size());
+
+			for (size_t i = 0; i < windowedSinc.size(); ++i) {
+				double x = (i - centreIndex)*scaleFactor;
+				int intX = std::round(x);
+				if (intX != 0 && std::abs(x - intX) < 1e-6) {
+					// Exact 0s
+					windowedSinc[i] = 0;
+				} else if (std::abs(x) > 1e-6) {
+					double p = x*M_PI;
+					windowedSinc[i] *= std::sin(p)/p;
+				}
+			}
+			
+			if (minimumPhase) {
+				signalsmith::fft::FFT<Sample> fft(windowedSinc.size()*2, 1);
+				windowedSinc.resize(fft.size(), 0);
+				std::vector<std::complex<Sample>> spectrum(fft.size());
+				std::vector<std::complex<Sample>> cepstrum(fft.size());
+				fft.fft(windowedSinc, spectrum);
+				for (size_t i = 0; i < fft.size(); ++i) {
+					spectrum[i] = std::log(std::abs(spectrum[i]) + 1e-30);
+				}
+				fft.fft(spectrum, cepstrum);
+				for (size_t i = 1; i < fft.size()/2; ++i) {
+					cepstrum[i] *= 0;
+				}
+				for (size_t i = fft.size()/2 + 1; i < fft.size(); ++i) {
+					cepstrum[i] *= 2;
+				}
+				Sample scaling = Sample(1)/fft.size();
+				fft.ifft(cepstrum, spectrum);
+
+				for (size_t i = 0; i < fft.size(); ++i) {
+					Sample phase = spectrum[i].imag()*scaling;
+					Sample mag = std::exp(spectrum[i].real()*scaling);
+					spectrum[i] = {mag*std::cos(phase), mag*std::sin(phase)};
+				}
+				fft.ifft(spectrum, cepstrum);
+				windowedSinc.resize(subSampleSteps*n + 1);
+				windowedSinc.shrink_to_fit();
+				for (size_t i = 0; i < windowedSinc.size(); ++i) {
+					windowedSinc[i] = cepstrum[i].real()*scaling;
+				}
+			}
+			
+			// Re-order into FIR fractional-delay blocks
+			coefficients.resize(n*(subSampleSteps + 1));
+			for (int k = 0; k <= subSampleSteps; ++k) {
+				for (int i = 0; i < n; ++i) {
+					coefficients[k*n + i] = windowedSinc[(subSampleSteps - k) + i*subSampleSteps];
+				}
+			}
+		}
+		
+		template<class Data>
+		Sample fractional(const Data &data, Sample fractional) const {
+			Sample subSampleDelay = fractional*subSampleSteps;
+			int lowIndex = subSampleDelay;
+			if (lowIndex >= subSampleSteps) lowIndex = subSampleSteps - 1;
+			Sample subSampleFractional = subSampleDelay - lowIndex;
+			int highIndex = lowIndex + 1;
+			
+			Sample sumLow = 0, sumHigh = 0;
+			const Sample *coeffLow = coefficients.data() + lowIndex*n;
+			const Sample *coeffHigh = coefficients.data() + highIndex*n;
+			for (int i = 0; i < n; ++i) {
+				sumLow += data[i]*coeffLow[i];
+				sumHigh += data[i]*coeffHigh[i];
+			}
+			return sumLow + (sumHigh - sumLow)*subSampleFractional;
+		}
+	};
+
+	template<typename Sample>
+	using InterpolatorKaiserSinc20 = InterpolatorKaiserSincN<Sample, 20>;
+	template<typename Sample>
+	using InterpolatorKaiserSinc8 = InterpolatorKaiserSincN<Sample, 8>;
+	template<typename Sample>
+	using InterpolatorKaiserSinc4 = InterpolatorKaiserSincN<Sample, 4>;
+
+	template<typename Sample>
+	using InterpolatorKaiserSinc20Min = InterpolatorKaiserSincN<Sample, 20, true>;
+	template<typename Sample>
+	using InterpolatorKaiserSinc8Min = InterpolatorKaiserSincN<Sample, 8, true>;
+	template<typename Sample>
+	using InterpolatorKaiserSinc4Min = InterpolatorKaiserSincN<Sample, 4, true>;
+	///  @}
+	
+	/** @brief A delay-line reader which uses an external buffer
+ 
+		This is useful if you have multiple delay-lines reading from the same buffer.
+	*/
+	template<class Sample, template<typename> class Interpolator=InterpolatorLinear>
+	class Reader : public Interpolator<Sample> /* so we can get the empty-base-class optimisation */ {
+		using Super = Interpolator<Sample>;
+	public:
+		Reader () {}
+		/// Pass in a configured interpolator
+		Reader (const Interpolator<Sample> &interpolator) : Super(interpolator) {}
+	
+		template<typename Buffer>
+		Sample read(const Buffer &buffer, Sample delaySamples) const {
+			int startIndex = delaySamples;
+			Sample remainder = delaySamples - startIndex;
+			
+			// Delay buffers use negative indices, but interpolators use positive ones
+			using View = decltype(buffer - startIndex);
+			struct Flipped {
+				 View view;
+				 Sample operator [](int i) const {
+					return view[-i];
+				 }
+			};
+			return Super::fractional(Flipped{buffer - startIndex}, remainder);
+		}
+	};
+
+	/**	@brief A single-channel delay-line containing its own buffer.*/
+	template<class Sample, template<typename> class Interpolator=InterpolatorLinear>
+	class Delay : private Reader<Sample, Interpolator> {
+		using Super = Reader<Sample, Interpolator>;
+		Buffer<Sample> buffer;
+	public:
+		static constexpr Sample latency = Super::latency;
+
+		Delay(int capacity=0) : buffer(1 + capacity + Super::inputLength) {}
+		/// Pass in a configured interpolator
+		Delay(const Interpolator<Sample> &interp, int capacity=0) : Super(interp), buffer(1 + capacity + Super::inputLength) {}
+		
+		void reset(Sample value=Sample()) {
+			buffer.reset(value);
+		}
+		void resize(int minCapacity, Sample value=Sample()) {
+			buffer.resize(minCapacity + Super::inputLength, value);
+		}
+		
+		/** Read a sample from `delaySamples` >= 0 in the past.
+		The interpolator may add its own latency on top of this (see `Delay::latency`).  The default interpolation (linear) has 0 latency.
+		*/
+		Sample read(Sample delaySamples) const {
+			return Super::read(buffer, delaySamples);
+		}
+		/// Writes a sample. Returns the same object, so that you can say `delay.write(v).read(delay)`.
+		Delay & write(Sample value) {
+			++buffer;
+			buffer[0] = value;
+			return *this;
+		}
+	};
+
+	/**	@brief A multi-channel delay-line with its own buffer. */
+	template<class Sample, template<typename> class Interpolator=InterpolatorLinear>
+	class MultiDelay : private Reader<Sample, Interpolator> {
+		using Super = Reader<Sample, Interpolator>;
+		int channels;
+		MultiBuffer<Sample> multiBuffer;
+	public:
+		static constexpr Sample latency = Super::latency;
+
+		MultiDelay(int channels=0, int capacity=0) : channels(channels), multiBuffer(channels, 1 + capacity + Super::inputLength) {}
+
+		void reset(Sample value=Sample()) {
+			multiBuffer.reset(value);
+		}
+		void resize(int nChannels, int capacity, Sample value=Sample()) {
+			channels = nChannels;
+			multiBuffer.resize(channels, capacity + Super::inputLength, value);
+		}
+		
+		/// A single-channel delay-line view, similar to a `const Delay`
+		struct ChannelView {
+			static constexpr Sample latency = Super::latency;
+
+			const Super &reader;
+			typename MultiBuffer<Sample>::ConstChannel channel;
+			
+			Sample read(Sample delaySamples) const {
+				return reader.read(channel, delaySamples);
+			}
+		};
+		ChannelView operator [](int channel) const {
+			return ChannelView{*this, multiBuffer[channel]};
+		}
+
+		/// A multi-channel result, lazily calculating samples
+		struct DelayView {
+			Super &reader;
+			typename MultiBuffer<Sample>::ConstView view;
+			Sample delaySamples;
+			
+			// Calculate samples on-the-fly
+			Sample operator [](int c) const {
+				return reader.read(view[c], delaySamples);
+			}
+		};
+		DelayView read(Sample delaySamples) {
+			return DelayView{*this, multiBuffer.constView(), delaySamples};
+		}
+		/// Reads into the provided output structure
+		template<class Output>
+		void read(Sample delaySamples, Output &output) {
+			for (int c = 0; c < channels; ++c) {
+				output[c] = Super::read(multiBuffer[c], delaySamples);
+			}
+		}
+		/// Reads separate delays for each channel
+		template<class Delays, class Output>
+		void readMulti(const Delays &delays, Output &output) {
+			for (int c = 0; c < channels; ++c) {
+				output[c] = Super::read(multiBuffer[c], delays[c]);
+			}
+		}
+		template<class Data>
+		MultiDelay & write(const Data &data) {
+			++multiBuffer;
+			for (int c = 0; c < channels; ++c) {
+				multiBuffer[c][0] = data[c];
+			}
+			return *this;
+		}
+	};
+
+/** @} */
+}} // signalsmith::delay::
+#endif // include guard
diff --git a/dsp/envelopes.h b/dsp/envelopes.h
new file mode 100644
index 0000000..a833c51
--- /dev/null
+++ b/dsp/envelopes.h
@@ -0,0 +1,523 @@
+#ifndef SIGNALSMITH_DSP_ENVELOPES_H
+#define SIGNALSMITH_DSP_ENVELOPES_H
+
+#include "./common.h"
+
+#include <cmath>
+#include <random>
+#include <vector>
+#include <iterator>
+
+namespace signalsmith {
+namespace envelopes {
+	/**	@defgroup Envelopes Envelopes and LFOs
+		@brief LFOs, envelopes and filters for manipulating them
+		
+		@{
+		@file
+	*/
+	
+	/**	An LFO based on cubic segments.
+		You can randomise the rate and/or the depth.  Randomising the depth past `0.5` means it no longer neatly alternates sides:
+			\diagram{cubic-lfo-example.svg,Some example LFO curves.}
+		Without randomisation, it is approximately sine-like:
+			\diagram{cubic-lfo-spectrum-pure.svg}
+	*/
+	class CubicLfo {
+		float ratio = 0;
+		float ratioStep = 0;
+		
+		float valueFrom = 0, valueTo = 1, valueRange = 1;
+		float targetLow = 0, targetHigh = 1;
+		float targetRate = 0;
+		float rateRandom = 0.5, depthRandom = 0;
+		bool freshReset = true;
+		
+		std::default_random_engine randomEngine;
+		std::uniform_real_distribution<float> randomUnit;
+		float random() {
+			return randomUnit(randomEngine);
+		}
+		float randomRate() {
+			return targetRate*exp(rateRandom*(random() - 0.5));
+		}
+		float randomTarget(float previous) {
+			float randomOffset = depthRandom*random()*(targetLow - targetHigh);
+			if (previous < (targetLow + targetHigh)*0.5f) {
+				return targetHigh + randomOffset;
+			} else {
+				return targetLow - randomOffset;
+			}
+		}
+	public:
+		CubicLfo() : randomEngine(std::random_device()()), randomUnit(0, 1) {
+			reset();
+		}
+		CubicLfo(long seed) : randomUnit(0, 1) {
+			randomEngine.seed(seed);
+			reset();
+		}
+
+		/// Resets the LFO state, starting with random phase.
+		void reset() {
+			ratio = random();
+			ratioStep = randomRate();
+			if (random() < 0.5) {
+				valueFrom = targetLow;
+				valueTo = targetHigh;
+			} else {
+				valueFrom = targetHigh;
+				valueTo = targetLow;
+			}
+			valueRange = valueTo - valueFrom;
+			freshReset = true;
+		}
+		/** Smoothly updates the LFO parameters.
+
+		If called directly after `.reset()`, oscillation will immediately start within the specified range.  Otherwise, it will remain smooth and fit within the new range after at most one cycle:
+			\diagram{cubic-lfo-changes.svg}
+
+		The LFO will complete a full oscillation in (approximately) `1/rate` samples.  `rateVariation` can be any number, but 0-1 is a good range.
+		
+		`depthVariation` must be in the range [0, 1], where ≤ 0.5 produces random amplitude but still alternates up/down.
+			\diagram{cubic-lfo-spectrum.svg,Spectra for the two types of randomisation - note the jump as depth variation goes past 50%}
+		*/
+		void set(float low, float high, float rate, float rateVariation=0, float depthVariation=0) {
+			rate *= 2; // We want to go up and down during this period
+			targetRate = rate;
+			targetLow = std::min(low, high);
+			targetHigh = std::max(low, high);
+			rateRandom = rateVariation;
+			depthRandom = std::min<float>(1, std::max<float>(0, depthVariation));
+			
+			// If we haven't called .next() yet, don't bother being smooth.
+			if (freshReset) return reset();
+
+			// Only update the current rate if it's outside our new random-variation range
+			float maxRandomRatio = exp((float)0.5*rateRandom);
+			if (ratioStep > rate*maxRandomRatio || ratioStep < rate/maxRandomRatio) {
+				ratioStep = randomRate();
+			}
+		}
+		
+		/// Returns the next output sample
+		float next() {
+			freshReset = false;
+			float result = ratio*ratio*(3 - 2*ratio)*valueRange + valueFrom;
+
+			ratio += ratioStep;
+			while (ratio >= 1) {
+				ratio -= 1;
+				ratioStep = randomRate();
+				valueFrom = valueTo;
+				valueTo = randomTarget(valueFrom);
+				valueRange = valueTo - valueFrom;
+			}
+			return result;
+		}
+	};
+	
+	/** Variable-width rectangular sum */
+	template<typename Sample=double>
+	class BoxSum {
+		int bufferLength, index;
+		std::vector<Sample> buffer;
+		Sample sum = 0, wrapJump = 0;
+	public:
+		BoxSum(int maxLength) {
+			resize(maxLength);
+		}
+
+		/// Sets the maximum size (and reset contents)
+		void resize(int maxLength) {
+			bufferLength = maxLength + 1;
+			buffer.resize(bufferLength);
+			if (maxLength != 0) buffer.shrink_to_fit();
+			reset();
+		}
+		
+		/// Resets (with an optional "fill" value)
+		void reset(Sample value=Sample()) {
+			index = 0;
+			sum = 0;
+			for (size_t i = 0; i < buffer.size(); ++i) {
+				buffer[i] = sum;
+				sum += value;
+			}
+			wrapJump = sum;
+			sum = 0;
+		}
+		
+		Sample read(int width) {
+			int readIndex = index - width;
+			double result = sum;
+			if (readIndex < 0) {
+				result += wrapJump;
+				readIndex += bufferLength;
+			}
+			return result - buffer[readIndex];
+		}
+		
+		void write(Sample value) {
+			++index;
+			if (index == bufferLength) {
+				index = 0;
+				wrapJump = sum;
+				sum = 0;
+			}
+			sum += value;
+			buffer[index] = sum;
+		}
+
+		Sample readWrite(Sample value, int width) {
+			write(value);
+			return read(width);
+		}
+	};
+	
+	/** Rectangular moving average filter (FIR).
+		\diagram{box-filter-example.svg}
+		A filter of length 1 has order 0 (i.e. does nothing). */
+	template<typename Sample=double>
+	class BoxFilter {
+		BoxSum<Sample> boxSum;
+		int _length, _maxLength;
+		Sample multiplier;
+	public:
+		BoxFilter(int maxLength) : boxSum(maxLength) {
+			resize(maxLength);
+		}
+		/// Sets the maximum size (and current size, and resets)
+		void resize(int maxLength) {
+			_maxLength = maxLength;
+			boxSum.resize(maxLength);
+			set(maxLength);
+		}
+		/// Sets the current size (expanding/allocating only if needed)
+		void set(int length) {
+			_length = length;
+			multiplier = Sample(1)/length;
+			if (length > _maxLength) resize(length);
+		}
+		
+		/// Resets (with an optional "fill" value)
+		void reset(Sample fill=Sample()) {
+			boxSum.reset(fill);
+		}
+		
+		Sample operator()(Sample v) {
+			return boxSum.readWrite(v, _length)*multiplier;
+		}
+	};
+
+	/** FIR filter made from a stack of `BoxFilter`s.
+		This filter has a non-negative impulse (monotonic step response), making it useful for smoothing positive-only values.  It provides an optimal set of box-lengths, chosen to minimise peaks in the stop-band:
+			\diagram{box-stack-long.svg,Impulse responses for various stack sizes at length N=1000}
+		Since the underlying box-averages must have integer width, the peaks are slightly higher for shorter lengths with higher numbers of layers:
+			\diagram{box-stack-short-freq.svg,Frequency responses for various stack sizes at length N=30}
+	*/
+	template<typename Sample=double>
+	class BoxStackFilter {
+		struct Layer {
+			double ratio = 0, lengthError = 0;
+			int length = 0;
+			BoxFilter<Sample> filter{0};
+			Layer() {}
+		};
+		int _size;
+		std::vector<Layer> layers;
+
+		template<class Iterable>
+		void setupLayers(const Iterable &ratios) {
+			layers.resize(0);
+			double sum = 0;
+			for (auto ratio : ratios) {
+				Layer layer;
+				layer.ratio = ratio;
+				layers.push_back(layer);
+				sum += ratio;
+			}
+			double factor = 1/sum;
+			for (auto &l : layers) {
+				l.ratio *= factor;
+			}
+		}
+	public:
+		BoxStackFilter(int maxSize, int layers=4) {
+			resize(maxSize, layers);
+		}
+
+		/// Returns an optimal set of length ratios (heuristic for larger depths)
+		static std::vector<double> optimalRatios(int layerCount) {
+			// Coefficients up to 6, found through numerical search
+			static double hardcoded[] = {1, 0.58224186169, 0.41775813831, 0.404078562416, 0.334851475794, 0.261069961789, 0.307944914938, 0.27369945234, 0.22913263601, 0.189222996712, 0.248329349789, 0.229253789144, 0.201191468123, 0.173033035122, 0.148192357821, 0.205275202874, 0.198413552119, 0.178256637764, 0.157821404506, 0.138663023387, 0.121570179349 /*, 0.178479592135, 0.171760666359, 0.158434068954, 0.143107825806, 0.125907148711, 0.11853946895, 0.103771229086, 0.155427880834, 0.153063152848, 0.142803459422, 0.131358358458, 0.104157805178, 0.119338029601, 0.0901675284678, 0.103683785192, 0.143949349747, 0.139813248378, 0.132051305252, 0.122216776152, 0.112888320989, 0.102534988632, 0.0928386714364, 0.0719750997699, 0.0817322396428, 0.130587011572, 0.127244563184, 0.121228748787, 0.113509941974, 0.105000272288, 0.0961938290157, 0.0880639725438, 0.0738389766046, 0.0746781936619, 0.0696544903682 */};
+			if (layerCount <= 0) {
+				return {};
+			} else if (layerCount <= 6) {
+				double *start = &hardcoded[layerCount*(layerCount - 1)/2];
+				return std::vector<double>(start, start + layerCount);
+			}
+			std::vector<double> result(layerCount);
+
+			double invN = 1.0/layerCount, sqrtN = std::sqrt(layerCount);
+			double p = 1 - invN;
+			double k = 1 + 4.5/sqrtN + 0.08*sqrtN;
+
+			double sum = 0;
+			for (int i = 0; i < layerCount; ++i) {
+				double x = i*invN;
+				double power = -x*(1 - p*std::exp(-x*k));
+				double length = std::pow(2, power);
+				result[i] = length;
+				sum += length;
+			}
+			double factor = 1/sum;
+			for (auto &r : result) r *= factor;
+			return result;
+		}
+		/** Approximate (optimal) bandwidth for a given number of layers
+		\diagram{box-stack-bandwidth.svg,Approximate main lobe width (bandwidth)}
+		*/
+		static constexpr double layersToBandwidth(int layers) {
+			return 1.58*(layers + 0.1);
+		}
+		/** Approximate (optimal) peak in the stop-band
+		\diagram{box-stack-peak.svg,Heuristic stop-band peak}
+		*/
+		static constexpr double layersToPeakDb(int layers) {
+			return 5 - layers*18;
+		}
+		
+		/// Sets size using an optimal (heuristic at larger sizes) set of length ratios
+		void resize(int maxSize, int layerCount) {
+			resize(maxSize, optimalRatios(layerCount));
+		}
+		/// Sets the maximum (and current) impulse response length and explicit length ratios
+		template<class List>
+		auto resize(int maxSize, List ratios) -> decltype(void(std::begin(ratios)), void(std::end(ratios))) {
+			setupLayers(ratios);
+			for (auto &layer : layers) layer.filter.resize(0); // .set() will expand it later
+			_size = -1;
+			set(maxSize);
+			reset();
+		}
+		void resize(int maxSize, std::initializer_list<double> ratios) {
+			resize<const std::initializer_list<double> &>(maxSize, ratios);
+		}
+		
+		/// Sets the impulse response length (does not reset if `size` ≤ `maxSize`)
+		void set(int size) {
+			if (layers.size() == 0) return; // meaningless
+
+			if (_size == size) return;
+			_size = size;
+			int order = size - 1;
+			int totalOrder  = 0;
+			
+			for (auto &layer : layers) {
+				double layerOrderFractional = layer.ratio*order;
+				int layerOrder = int(layerOrderFractional);
+				layer.length = layerOrder + 1;
+				layer.lengthError = layerOrder - layerOrderFractional;
+				totalOrder += layerOrder;
+			}
+			// Round some of them up, so the total is correct - this is O(N²), but `layers.size()` is small
+			while (totalOrder < order) {
+				int minIndex = 0;
+				double minError = layers[0].lengthError;
+				for (size_t i = 1; i < layers.size(); ++i) {
+					if (layers[i].lengthError < minError) {
+						minError = layers[i].lengthError;
+						minIndex = i;
+					}
+				}
+				layers[minIndex].length++;
+				layers[minIndex].lengthError += 1;
+				totalOrder++;
+			}
+			for (auto &layer : layers) layer.filter.set(layer.length);
+		}
+
+		/// Resets the filter
+		void reset(Sample fill=Sample()) {
+			for (auto &layer : layers) layer.filter.reset(fill);
+		}
+		
+		Sample operator()(Sample v) {
+			for (auto &layer : layers) {
+				v = layer.filter(v);
+			}
+			return v;
+		}
+	};
+	
+	/** Peak-hold filter.
+		\diagram{peak-hold.svg}
+		
+		The size is variable, and can be changed instantly with `.set()`, or by using `.push()`/`.pop()` in an unbalanced way.
+
+		This has complexity O(1) every sample when the length remains constant (balanced `.push()`/`.pop()`, or using `filter(v)`), and amortised O(1) complexity otherwise.  To avoid allocations while running, it pre-allocates a vector (not a `std::deque`) which determines the maximum length.
+	*/
+	template<typename Sample>
+	class PeakHold {
+		static constexpr Sample lowest = std::numeric_limits<Sample>::lowest();
+		int bufferMask;
+		std::vector<Sample> buffer;
+		int backIndex = 0, middleStart = 0, workingIndex = 0, middleEnd = 0, frontIndex = 0;
+		Sample frontMax = lowest, workingMax = lowest, middleMax = lowest;
+		
+	public:
+		PeakHold(int maxLength) {
+			resize(maxLength);
+		}
+		int size() {
+			return frontIndex - backIndex;
+		}
+		void resize(int maxLength) {
+			int bufferLength = 1;
+			while (bufferLength < maxLength) bufferLength *= 2;
+			buffer.resize(bufferLength);
+			bufferMask = bufferLength - 1;
+			
+			frontIndex = backIndex + maxLength;
+			reset();
+		}
+		void reset(Sample fill=lowest) {
+			int prevSize = size();
+			buffer.assign(buffer.size(), fill);
+			frontMax = workingMax = middleMax = lowest;
+			middleEnd = workingIndex = frontIndex = 0;
+			middleStart = middleEnd - (prevSize/2);
+			backIndex = frontIndex - prevSize;
+		}
+		/** Sets the size immediately.
+		Must be `0 <= newSize <= maxLength` (see constructor and `.resize()`).
+		
+		Shrinking doesn't destroy information, and if you expand again (with `preserveCurrentPeak=false`), you will get the same output as before shrinking.  Expanding when `preserveCurrentPeak` is enabled is destructive, re-writing its history such that the current output value is unchanged.*/
+		void set(int newSize, bool preserveCurrentPeak=false) {
+			while (size() < newSize) {
+				Sample &backPrev = buffer[backIndex&bufferMask];
+				--backIndex;
+				Sample &back = buffer[backIndex&bufferMask];
+				back = preserveCurrentPeak ? backPrev : std::max(back, backPrev);
+			}
+			while (size() > newSize) {
+				pop();
+			}
+		}
+		
+		void push(Sample v) {
+			buffer[frontIndex&bufferMask] = v;
+			++frontIndex;
+			frontMax = std::max(frontMax, v);
+		}
+		void pop() {
+			if (backIndex == middleStart) {
+				// Move along the maximums
+				workingMax = lowest;
+				middleMax = frontMax;
+				frontMax = lowest;
+
+				int prevFrontLength = frontIndex - middleEnd;
+				int prevMiddleLength = middleEnd - middleStart;
+				if (prevFrontLength <= prevMiddleLength + 1) {
+					// Swap over simply
+					middleStart = middleEnd;
+					middleEnd = frontIndex;
+					workingIndex = middleEnd;
+				} else {
+					// The front is longer than the middle - only happens if unbalanced
+					// We don't move *all* of the front over, keeping half the surplus in the front
+					int middleLength = (frontIndex - middleStart)/2;
+					middleStart = middleEnd;
+					middleEnd += middleLength;
+
+					// Working index is close enough that it will be finished by the time the back is empty
+					int backLength = middleStart - backIndex;
+					int workingLength = std::min(backLength, middleEnd - middleStart);
+					workingIndex = middleStart + workingLength;
+
+					// Since the front was not completely consumed, we re-calculate the front's maximum
+					for (int i = middleEnd; i != frontIndex; ++i) {
+						frontMax = std::max(frontMax, buffer[i&bufferMask]);
+					}
+					// The index might not start at the end of the working block - compute the last bit immediately
+					for (int i = middleEnd - 1; i != workingIndex - 1; --i) {
+						buffer[i&bufferMask] = workingMax = std::max(workingMax, buffer[i&bufferMask]);
+					}
+				}
+
+				// Is the new back (previous middle) empty? Only happens if unbalanced
+				if (backIndex == middleStart) {
+					 // swap over again (front's empty, no change)
+					workingMax = lowest;
+					middleMax = frontMax;
+					frontMax = lowest;
+					middleStart = workingIndex = middleEnd;
+	
+					if (backIndex == middleStart) {
+						--backIndex; // Only happens if you pop from an empty list - fail nicely
+					}
+				}
+				
+				buffer[frontIndex&bufferMask] = lowest; // In case of length 0, when everything points at this value
+			}
+
+			++backIndex;
+			if (workingIndex != middleStart) {
+				--workingIndex;
+				buffer[workingIndex&bufferMask] = workingMax = std::max(workingMax, buffer[workingIndex&bufferMask]);
+			}
+		}
+		Sample read() {
+			Sample backMax = buffer[backIndex&bufferMask];
+			return std::max(backMax, std::max(middleMax, frontMax));
+		}
+		
+		// For simple use as a constant-length filter
+		Sample operator ()(Sample v) {
+			push(v);
+			pop();
+			return read();
+		}
+	};
+	
+	/** Peak-decay filter with a linear shape and fixed-time return to constant value.
+		\diagram{peak-decay-linear.svg}
+		This is equivalent to a `BoxFilter` which resets itself whenever the output would be less than the input.
+	*/
+	template<typename Sample=double>
+	class PeakDecayLinear {
+		static constexpr Sample lowest = std::numeric_limits<Sample>::lowest();
+		PeakHold<Sample> peakHold;
+		Sample value = lowest;
+		Sample stepMultiplier = 1;
+	public:
+		PeakDecayLinear(int maxLength) : peakHold(maxLength) {
+			set(maxLength);
+		}
+		void resize(int maxLength) {
+			peakHold.resize(maxLength);
+			reset();
+		}
+		void set(double length) {
+			peakHold.set(std::ceil(length));
+			// Overshoot slightly but don't exceed 1
+			stepMultiplier = Sample(1.0001)/std::max(1.0001, length);
+		}
+		void reset(Sample start=lowest) {
+			peakHold.reset(start);
+			set(peakHold.size());
+			value = start;
+		}
+		
+		Sample operator ()(Sample v) {
+			Sample peak = peakHold.read();
+			peakHold(v);
+			return value = std::max<Sample>(v, value + (v - peak)*stepMultiplier);
+		}
+	};
+
+/** @} */
+}} // signalsmith::envelopes::
+#endif // include guard
diff --git a/dsp/fft.h b/dsp/fft.h
new file mode 100644
index 0000000..fcaf4f0
--- /dev/null
+++ b/dsp/fft.h
@@ -0,0 +1,520 @@
+#ifndef SIGNALSMITH_FFT_V5
+#define SIGNALSMITH_FFT_V5
+
+#include "./common.h"
+#include "./perf.h"
+
+#include <vector>
+#include <complex>
+#include <cmath>
+
+namespace signalsmith { namespace fft {
+	/**	@defgroup FFT FFT (complex and real)
+		@brief Fourier transforms (complex and real)
+
+		@{
+		@file
+	*/
+
+	namespace _fft_impl {
+
+		template <typename V>
+		SIGNALSMITH_INLINE V complexReal(const std::complex<V> &c) {
+			return ((V*)(&c))[0];
+		}
+		template <typename V>
+		SIGNALSMITH_INLINE V complexImag(const std::complex<V> &c) {
+			return ((V*)(&c))[1];
+		}
+
+		// Complex multiplication has edge-cases around Inf/NaN - handling those properly makes std::complex non-inlineable, so we use our own
+		template <bool conjugateSecond, typename V>
+		SIGNALSMITH_INLINE std::complex<V> complexMul(const std::complex<V> &a, const std::complex<V> &b) {
+			V aReal = complexReal(a), aImag = complexImag(a);
+			V bReal = complexReal(b), bImag = complexImag(b);
+			return conjugateSecond ? std::complex<V>{
+				bReal*aReal + bImag*aImag,
+				bReal*aImag - bImag*aReal
+			} : std::complex<V>{
+				aReal*bReal - aImag*bImag,
+				aReal*bImag + aImag*bReal
+			};
+		}
+
+		template<bool flipped, typename V>
+		SIGNALSMITH_INLINE std::complex<V> complexAddI(const std::complex<V> &a, const std::complex<V> &b) {
+			V aReal = complexReal(a), aImag = complexImag(a);
+			V bReal = complexReal(b), bImag = complexImag(b);
+			return flipped ? std::complex<V>{
+				aReal + bImag,
+				aImag - bReal
+			} : std::complex<V>{
+				aReal - bImag,
+				aImag + bReal
+			};
+		}
+
+		// Use SFINAE to get an iterator from std::begin(), if supported - otherwise assume the value itself is an iterator
+		template<typename T, typename=void>
+		struct GetIterator {
+			static T get(const T &t) {
+				return t;
+			}
+		};
+		template<typename T>
+		struct GetIterator<T, decltype((void)std::begin(std::declval<T>()))> {
+			static auto get(const T &t) -> decltype(std::begin(t)) {
+				return std::begin(t);
+			}
+		};
+	}
+
+	/** Floating-point FFT implementation.
+	It is fast for 2^a * 3^b.
+	Here are the peak and RMS errors for `float`/`double` computation:
+	\diagram{fft-errors.svg Simulated errors for pure-tone harmonic inputs\, compared to a theoretical upper bound from "Roundoff error analysis of the fast Fourier transform" (G. Ramos, 1971)}
+	*/
+	template<typename V=double>
+	class FFT {
+		using complex = std::complex<V>;
+		size_t _size;
+		std::vector<complex> workingVector;
+		
+		enum class StepType {
+			generic, step2, step3, step4
+		};
+		struct Step {
+			StepType type;
+			size_t factor;
+			size_t startIndex;
+			size_t innerRepeats;
+			size_t outerRepeats;
+			size_t twiddleIndex;
+		};
+		std::vector<size_t> factors;
+		std::vector<Step> plan;
+		std::vector<complex> twiddleVector;
+		
+		struct PermutationPair {size_t from, to;};
+		std::vector<PermutationPair> permutation;
+		
+		void addPlanSteps(size_t factorIndex, size_t start, size_t length, size_t repeats) {
+			if (factorIndex >= factors.size()) return;
+			
+			size_t factor = factors[factorIndex];
+			if (factorIndex + 1 < factors.size()) {
+				if (factors[factorIndex] == 2 && factors[factorIndex + 1] == 2) {
+					++factorIndex;
+					factor = 4;
+				}
+			}
+
+			size_t subLength = length/factor;
+			Step mainStep{StepType::generic, factor, start, subLength, repeats, twiddleVector.size()};
+
+			if (factor == 2) mainStep.type = StepType::step2;
+			if (factor == 3) mainStep.type = StepType::step3;
+			if (factor == 4) mainStep.type = StepType::step4;
+
+			// Twiddles
+			bool foundStep = false;
+			for (const Step &existingStep : plan) {
+				if (existingStep.factor == mainStep.factor && existingStep.innerRepeats == mainStep.innerRepeats) {
+					foundStep = true;
+					mainStep.twiddleIndex = existingStep.twiddleIndex;
+					break;
+				}
+			}
+			if (!foundStep) {
+				for (size_t i = 0; i < subLength; ++i) {
+					for (size_t f = 0; f < factor; ++f) {
+						double phase = 2*M_PI*i*f/length;
+						complex twiddle = {V(std::cos(phase)), V(-std::sin(phase))};
+						twiddleVector.push_back(twiddle);
+					}
+				}
+			}
+
+			if (repeats == 1 && sizeof(complex)*subLength > 65536) {
+				for (size_t i = 0; i < factor; ++i) {
+					addPlanSteps(factorIndex + 1, start + i*subLength, subLength, 1);
+				}
+			} else {
+				addPlanSteps(factorIndex + 1, start, subLength, repeats*factor);
+			}
+			plan.push_back(mainStep);
+		}
+		void setPlan() {
+			factors.resize(0);
+			size_t size = _size, factor = 2;
+			while (size > 1) {
+				if (size%factor == 0) {
+					factors.push_back(factor);
+					size /= factor;
+				} else if (factor > sqrt(size)) {
+					factor = size;
+				} else {
+					++factor;
+				}
+			}
+
+			plan.resize(0);
+			twiddleVector.resize(0);
+			addPlanSteps(0, 0, _size, 1);
+			
+			permutation.resize(0);
+			permutation.push_back(PermutationPair{0, 0});
+			size_t indexLow = 0, indexHigh = factors.size();
+			size_t inputStepLow = _size, outputStepLow = 1;
+			size_t inputStepHigh = 1, outputStepHigh = _size;
+			while (outputStepLow*inputStepHigh < _size) {
+				size_t f, inputStep, outputStep;
+				if (outputStepLow <= inputStepHigh) {
+					f = factors[indexLow++];
+					inputStep = (inputStepLow /= f);
+					outputStep = outputStepLow;
+					outputStepLow *= f;
+				} else {
+					f = factors[--indexHigh];
+					inputStep = inputStepHigh;
+					inputStepHigh *= f;
+					outputStep = (outputStepHigh /= f);
+				}
+				size_t oldSize = permutation.size();
+				for (size_t i = 1; i < f; ++i) {
+					for (size_t j = 0; j < oldSize; ++j) {
+						PermutationPair pair = permutation[j];
+						pair.from += i*inputStep;
+						pair.to += i*outputStep;
+						permutation.push_back(pair);
+					}
+				}
+			}
+		}
+
+		template<bool inverse, typename RandomAccessIterator>
+		void fftStepGeneric(RandomAccessIterator &&origData, const Step &step) {
+			complex *working = workingVector.data();
+			const size_t stride = step.innerRepeats;
+
+			for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) {
+				RandomAccessIterator data = origData;
+				
+				const complex *twiddles = twiddleVector.data() + step.twiddleIndex;
+				const size_t factor = step.factor;
+				for (size_t repeat = 0; repeat < step.innerRepeats; ++repeat) {
+					for (size_t i = 0; i < step.factor; ++i) {
+						working[i] = _fft_impl::complexMul<inverse>(data[i*stride], twiddles[i]);
+					}
+					for (size_t f = 0; f < factor; ++f) {
+						complex sum = working[0];
+						for (size_t i = 1; i < factor; ++i) {
+							double phase = 2*M_PI*f*i/factor;
+							complex twiddle = {V(std::cos(phase)), V(-std::sin(phase))};
+							sum += _fft_impl::complexMul<inverse>(working[i], twiddle);
+						}
+						data[f*stride] = sum;
+					}
+					++data;
+					twiddles += factor;
+				}
+				origData += step.factor*step.innerRepeats;
+			}
+		}
+
+		template<bool inverse, typename RandomAccessIterator>
+		SIGNALSMITH_INLINE void fftStep2(RandomAccessIterator &&origData, const Step &step) {
+			const size_t stride = step.innerRepeats;
+			const complex *origTwiddles = twiddleVector.data() + step.twiddleIndex;
+			for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) {
+				const complex* twiddles = origTwiddles;
+				for (RandomAccessIterator data = origData; data < origData + stride; ++data) {
+					complex A = data[0];
+					complex B = _fft_impl::complexMul<inverse>(data[stride], twiddles[1]);
+					
+					data[0] = A + B;
+					data[stride] = A - B;
+					twiddles += 2;
+				}
+				origData += 2*stride;
+			}
+		}
+
+		template<bool inverse, typename RandomAccessIterator>
+		SIGNALSMITH_INLINE void fftStep3(RandomAccessIterator &&origData, const Step &step) {
+			constexpr complex factor3 = {-0.5, inverse ? 0.8660254037844386 : -0.8660254037844386};
+			const size_t stride = step.innerRepeats;
+			const complex *origTwiddles = twiddleVector.data() + step.twiddleIndex;
+			
+			for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) {
+				const complex* twiddles = origTwiddles;
+				for (RandomAccessIterator data = origData; data < origData + stride; ++data) {
+					complex A = data[0];
+					complex B = _fft_impl::complexMul<inverse>(data[stride], twiddles[1]);
+					complex C = _fft_impl::complexMul<inverse>(data[stride*2], twiddles[2]);
+					
+					complex realSum = A + (B + C)*factor3.real();
+					complex imagSum = (B - C)*factor3.imag();
+
+					data[0] = A + B + C;
+					data[stride] = _fft_impl::complexAddI<false>(realSum, imagSum);
+					data[stride*2] = _fft_impl::complexAddI<true>(realSum, imagSum);
+
+					twiddles += 3;
+				}
+				origData += 3*stride;
+			}
+		}
+
+		template<bool inverse, typename RandomAccessIterator>
+		SIGNALSMITH_INLINE void fftStep4(RandomAccessIterator &&origData, const Step &step) {
+			const size_t stride = step.innerRepeats;
+			const complex *origTwiddles = twiddleVector.data() + step.twiddleIndex;
+			
+			for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) {
+				const complex* twiddles = origTwiddles;
+				for (RandomAccessIterator data = origData; data < origData + stride; ++data) {
+					complex A = data[0];
+					complex C = _fft_impl::complexMul<inverse>(data[stride], twiddles[2]);
+					complex B = _fft_impl::complexMul<inverse>(data[stride*2], twiddles[1]);
+					complex D = _fft_impl::complexMul<inverse>(data[stride*3], twiddles[3]);
+
+					complex sumAC = A + C, sumBD = B + D;
+					complex diffAC = A - C, diffBD = B - D;
+
+					data[0] = sumAC + sumBD;
+					data[stride] = _fft_impl::complexAddI<!inverse>(diffAC, diffBD);
+					data[stride*2] = sumAC - sumBD;
+					data[stride*3] = _fft_impl::complexAddI<inverse>(diffAC, diffBD);
+
+					twiddles += 4;
+				}
+				origData += 4*stride;
+			}
+		}
+		
+		template<typename InputIterator, typename OutputIterator>
+		void permute(InputIterator input, OutputIterator data) {
+			for (auto pair : permutation) {
+				data[pair.from] = input[pair.to];
+			}
+		}
+
+		template<bool inverse, typename InputIterator, typename OutputIterator>
+		void run(InputIterator &&input, OutputIterator &&data) {
+			permute(input, data);
+			
+			for (const Step &step : plan) {
+				switch (step.type) {
+					case StepType::generic:
+						fftStepGeneric<inverse>(data + step.startIndex, step);
+						break;
+					case StepType::step2:
+						fftStep2<inverse>(data + step.startIndex, step);
+						break;
+					case StepType::step3:
+						fftStep3<inverse>(data + step.startIndex, step);
+						break;
+					case StepType::step4:
+						fftStep4<inverse>(data + step.startIndex, step);
+						break;
+				}
+			}
+		}
+
+		static bool validSize(size_t size) {
+			constexpr static bool filter[32] = {
+				1, 1, 1, 1, 1, 0, 1, 0, 1, 1, // 0-9
+				0, 0, 1, 0, 0, 0, 1, 0, 1, 0, // 10-19
+				0, 0, 0, 0, 1, 0, 0, 0, 0, 0, // 20-29
+				0, 0
+			};
+			return filter[size];
+		}
+	public:
+		static size_t fastSizeAbove(size_t size) {
+			size_t power2 = 1;
+			while (size >= 32) {
+				size = (size - 1)/2 + 1;
+				power2 *= 2;
+			}
+			while (size < 32 && !validSize(size)) {
+				++size;
+			}
+			return power2*size;
+		}
+		static size_t fastSizeBelow(size_t size) {
+			size_t power2 = 1;
+			while (size >= 32) {
+				size /= 2;
+				power2 *= 2;
+			}
+			while (size > 1 && !validSize(size)) {
+				--size;
+			}
+			return power2*size;
+		}
+
+		FFT(size_t size, int fastDirection=0) : _size(0) {
+			if (fastDirection > 0) size = fastSizeAbove(size);
+			if (fastDirection < 0) size = fastSizeBelow(size);
+			this->setSize(size);
+		}
+
+		size_t setSize(size_t size) {
+			if (size != _size) {
+				_size = size;
+				workingVector.resize(size);
+				setPlan();
+			}
+			return _size;
+		}
+		size_t setFastSizeAbove(size_t size) {
+			return setSize(fastSizeAbove(size));
+		}
+		size_t setFastSizeBelow(size_t size) {
+			return setSize(fastSizeBelow(size));
+		}
+		const size_t & size() const {
+			return _size;
+		}
+
+		template<typename InputIterator, typename OutputIterator>
+		void fft(InputIterator &&input, OutputIterator &&output) {
+			auto inputIter = _fft_impl::GetIterator<InputIterator>::get(input);
+			auto outputIter = _fft_impl::GetIterator<OutputIterator>::get(output);
+			return run<false>(inputIter, outputIter);
+		}
+
+		template<typename InputIterator, typename OutputIterator>
+		void ifft(InputIterator &&input, OutputIterator &&output) {
+			auto inputIter = _fft_impl::GetIterator<InputIterator>::get(input);
+			auto outputIter = _fft_impl::GetIterator<OutputIterator>::get(output);
+			return run<true>(inputIter, outputIter);
+		}
+	};
+
+	struct FFTOptions {
+		static constexpr int halfFreqShift = 1;
+	};
+
+	template<typename V, int optionFlags=0>
+	class RealFFT {
+		static constexpr bool modified = (optionFlags&FFTOptions::halfFreqShift);
+
+		using complex = std::complex<V>;
+		std::vector<complex> complexBuffer1, complexBuffer2;
+		std::vector<complex> twiddlesMinusI;
+		std::vector<complex> modifiedRotations;
+		FFT<V> complexFft;
+	public:
+		static size_t fastSizeAbove(size_t size) {
+			return FFT<V>::fastSizeAbove((size + 1)/2)*2;
+		}
+		static size_t fastSizeBelow(size_t size) {
+			return FFT<V>::fastSizeBelow(size/2)*2;
+		}
+
+		RealFFT(size_t size=0, int fastDirection=0) : complexFft(0) {
+			if (fastDirection > 0) size = fastSizeAbove(size);
+			if (fastDirection < 0) size = fastSizeBelow(size);
+			this->setSize(std::max<size_t>(size, 2));
+		}
+
+		size_t setSize(size_t size) {
+			complexBuffer1.resize(size/2);
+			complexBuffer2.resize(size/2);
+
+			size_t hhSize = size/4 + 1;
+			twiddlesMinusI.resize(hhSize);
+			for (size_t i = 0; i < hhSize; ++i) {
+				V rotPhase = -2*M_PI*(modified ? i + 0.5 : i)/size;
+				twiddlesMinusI[i] = {std::sin(rotPhase), -std::cos(rotPhase)};
+			}
+			if (modified) {
+				modifiedRotations.resize(size/2);
+				for (size_t i = 0; i < size/2; ++i) {
+					V rotPhase = -2*M_PI*i/size;
+					modifiedRotations[i] = {std::cos(rotPhase), std::sin(rotPhase)};
+				}
+			}
+			
+			return complexFft.setSize(size/2);
+		}
+		size_t setFastSizeAbove(size_t size) {
+			return setSize(fastSizeAbove(size));
+		}
+		size_t setFastSizeBelow(size_t size) {
+			return setSize(fastSizeBelow(size));
+		}
+		size_t size() const {
+			return complexFft.size()*2;
+		}
+
+		template<typename InputIterator, typename OutputIterator>
+		void fft(InputIterator &&input, OutputIterator &&output) {
+			size_t hSize = complexFft.size();
+			for (size_t i = 0; i < hSize; ++i) {
+				if (modified) {
+					complexBuffer1[i] = _fft_impl::complexMul<false>({input[2*i], input[2*i + 1]}, modifiedRotations[i]);
+				} else {
+					complexBuffer1[i] = {input[2*i], input[2*i + 1]};
+				}
+			}
+			
+			complexFft.fft(complexBuffer1.data(), complexBuffer2.data());
+			
+			if (!modified) output[0] = {
+				complexBuffer2[0].real() + complexBuffer2[0].imag(),
+				complexBuffer2[0].real() - complexBuffer2[0].imag()
+			};
+			for (size_t i = modified ? 0 : 1; i <= hSize/2; ++i) {
+				size_t conjI = modified ? (hSize  - 1 - i) : (hSize - i);
+				
+				complex odd = (complexBuffer2[i] + conj(complexBuffer2[conjI]))*(V)0.5;
+				complex evenI = (complexBuffer2[i] - conj(complexBuffer2[conjI]))*(V)0.5;
+				complex evenRotMinusI = _fft_impl::complexMul<false>(evenI, twiddlesMinusI[i]);
+
+				output[i] = odd + evenRotMinusI;
+				output[conjI] = conj(odd - evenRotMinusI);
+			}
+		}
+
+		template<typename InputIterator, typename OutputIterator>
+		void ifft(InputIterator &&input, OutputIterator &&output) {
+			size_t hSize = complexFft.size();
+			if (!modified) complexBuffer1[0] = {
+				input[0].real() + input[0].imag(),
+				input[0].real() - input[0].imag()
+			};
+			for (size_t i = modified ? 0 : 1; i <= hSize/2; ++i) {
+				size_t conjI = modified ? (hSize  - 1 - i) : (hSize - i);
+				complex v = input[i], v2 = input[conjI];
+
+				complex odd = v + conj(v2);
+				complex evenRotMinusI = v - conj(v2);
+				complex evenI = _fft_impl::complexMul<true>(evenRotMinusI, twiddlesMinusI[i]);
+				
+				complexBuffer1[i] = odd + evenI;
+				complexBuffer1[conjI] = conj(odd - evenI);
+			}
+			
+			complexFft.ifft(complexBuffer1.data(), complexBuffer2.data());
+			
+			for (size_t i = 0; i < hSize; ++i) {
+				complex v = complexBuffer2[i];
+				if (modified) v = _fft_impl::complexMul<true>(v, modifiedRotations[i]);
+				output[2*i] = v.real();
+				output[2*i + 1] = v.imag();
+			}
+		}
+	};
+
+	template<typename V>
+	struct ModifiedRealFFT : public RealFFT<V, FFTOptions::halfFreqShift> {
+		using RealFFT<V, FFTOptions::halfFreqShift>::RealFFT;
+	};
+
+/// @}
+}} // namespace
+#endif // include guard
diff --git a/dsp/filters.h b/dsp/filters.h
new file mode 100644
index 0000000..9545062
--- /dev/null
+++ b/dsp/filters.h
@@ -0,0 +1,434 @@
+#ifndef SIGNALSMITH_DSP_FILTERS_H
+#define SIGNALSMITH_DSP_FILTERS_H
+
+#include "./common.h"
+#include "./perf.h"
+
+#include <cmath>
+#include <complex>
+
+namespace signalsmith {
+namespace filters {
+	/**	@defgroup Filters Basic filters
+		@brief Classes for some common filter types
+		
+		@{
+		@file
+	*/
+	
+	/** Filter design methods.
+		These differ mostly in how they handle frequency-warping near Nyquist:
+		\diagram{filters-lowpass.svg}
+		\diagram{filters-highpass.svg}
+		\diagram{filters-peak.svg}
+		\diagram{filters-bandpass.svg}
+		\diagram{filters-notch.svg}
+		\diagram{filters-high-shelf.svg}
+		\diagram{filters-low-shelf.svg}
+		\diagram{filters-allpass.svg}
+	*/
+	enum class BiquadDesign {
+		bilinear, ///< Bilinear transform, adjusting for centre frequency but not bandwidth
+ 		cookbook, ///< RBJ's "Audio EQ Cookbook".  Based on `bilinear`, adjusting bandwidth (for peak/notch/bandpass) to preserve the ratio between upper/lower boundaries.  This performs oddly near Nyquist.
+		oneSided, ///< Based on `bilinear`, adjusting bandwidth to preserve the lower boundary (leaving the upper one loose).
+		vicanek ///< From Martin Vicanek's [Matched Second Order Digital Filters](https://vicanek.de/articles/BiquadFits.pdf).  Falls back to `oneSided` for shelf and allpass filters.  This takes the poles from the impulse-invariant approach, and then picks the zeros to create a better match.  This means that Nyquist is not 0dB for peak/notch (or -Inf for lowpass), but it is a decent match to the analogue prototype.
+	};
+	
+	/** A standard biquad.
+
+		This is not guaranteed to be stable if modulated at audio rate.
+		
+		The default highpass/lowpass bandwidth (`defaultBandwidth`) produces a Butterworth filter when bandwidth-compensation is disabled.
+		
+		Bandwidth compensation defaults to `BiquadDesign::oneSided` (or `BiquadDesign::cookbook` if `cookbookBandwidth` is enabled) for all filter types aside from highpass/lowpass (which use `BiquadDesign::bilinear`).*/
+	template<typename Sample, bool cookbookBandwidth=false>
+	class BiquadStatic {
+		static constexpr BiquadDesign bwDesign = cookbookBandwidth ? BiquadDesign::cookbook : BiquadDesign::oneSided;
+		Sample a1 = 0, a2 = 0, b0 = 1, b1 = 0, b2 = 0;
+		Sample x1 = 0, x2 = 0, y1 = 0, y2 = 0;
+		
+		enum class Type {highpass, lowpass, highShelf, lowShelf, bandpass, notch, peak, allpass};
+
+		struct FreqSpec {
+			double scaledFreq;
+			double w0, sinW0, cosW0;
+			double inv2Q;
+			
+			FreqSpec(double scaledFreq, BiquadDesign design) {
+				this->scaledFreq = scaledFreq = std::max(1e-6, std::min(0.4999, scaledFreq));
+				if (design == BiquadDesign::cookbook) {
+					// Falls apart a bit near Nyquist
+					this->scaledFreq = scaledFreq = std::min(0.45, scaledFreq);
+				}
+				w0 = 2*M_PI*scaledFreq;
+				cosW0 = std::cos(w0);
+				sinW0 = std::sin(w0);
+			}
+			
+			void oneSidedCompQ() {
+				// Ratio between our (digital) lower boundary f1 and centre f0
+				double f1Factor = std::sqrt(inv2Q*inv2Q + 1) - inv2Q;
+				// Bilinear means discrete-time freq f = continuous-time freq tan(pi*xf/pi)
+				double ctF1 = std::tan(M_PI*scaledFreq*f1Factor), invCtF0 = (1 + cosW0)/sinW0;
+				double ctF1Factor = ctF1*invCtF0;
+				inv2Q = 0.5/ctF1Factor - 0.5*ctF1Factor;
+			}
+		};
+		SIGNALSMITH_INLINE static FreqSpec octaveSpec(double scaledFreq, double octaves, BiquadDesign design) {
+			FreqSpec spec(scaledFreq, design);
+
+			if (design == BiquadDesign::cookbook) {
+				// Approximately preserves bandwidth between halfway points
+				octaves *= spec.w0/spec.sinW0;
+			}
+			spec.inv2Q = std::sinh(std::log(2)*0.5*octaves); // 1/(2Q)
+			if (design == BiquadDesign::oneSided) spec.oneSidedCompQ();
+			return spec;
+		}
+		SIGNALSMITH_INLINE static FreqSpec qSpec(double scaledFreq, double q, BiquadDesign design) {
+			FreqSpec spec(scaledFreq, design);
+
+			spec.inv2Q = 0.5/q;
+			if (design == BiquadDesign::oneSided) spec.oneSidedCompQ();
+			return spec;
+		}
+		
+		SIGNALSMITH_INLINE double dbToSqrtGain(double db) {
+			return std::pow(10, db*0.025);
+		}
+		
+		SIGNALSMITH_INLINE BiquadStatic & configure(Type type, FreqSpec calc, double sqrtGain, BiquadDesign design) {
+			double w0 = calc.w0;
+			
+			if (design == BiquadDesign::vicanek) {
+				if (type == Type::notch) { // Heuristic for notches near Nyquist
+					calc.inv2Q *= (1 - calc.scaledFreq*0.5);
+				}
+				double Q = (type == Type::peak ? 0.5*sqrtGain : 0.5)/calc.inv2Q;
+				double q = (type == Type::peak ? 1/sqrtGain : 1)*calc.inv2Q;
+				double expmqw = std::exp(-q*w0);
+				if (q <= 1) {
+					a1 = -2*expmqw*std::cos(std::sqrt(1 - q*q)*w0);
+				} else {
+					a1 = -2*expmqw*std::cosh(std::sqrt(q*q - 1)*w0);
+				}
+				a2 = expmqw*expmqw;
+				double sinpd2 = std::sin(w0/2);
+				double p0 = 1 - sinpd2*sinpd2, p1 = sinpd2*sinpd2, p2 = 4*p0*p1;
+				double A0 = 1 + a1 + a2, A1 = 1 - a1 + a2, A2 = -4*a2;
+				A0 *= A0;
+				A1 *= A1;
+				if (type == Type::lowpass) {
+					double R1 = (A0*p0 + A1*p1 + A2*p2)*Q*Q;
+					double B0 = A0, B1 = (R1 - B0*p0)/p1;
+					b0 = 0.5*(std::sqrt(B0) + std::sqrt(B1));
+					b1 = std::sqrt(B0) - b0;
+					b2 = 0;
+					return *this;
+				} else if (type == Type::highpass) {
+					b2 = b0 = std::sqrt(A0*p0 + A1*p1 + A2*p2)*Q/(4*p1);
+					b1 = -2*b0;
+					return *this;
+				} else if (type == Type::bandpass) {
+					double R1 = A0*p0 + A1*p1 + A2*p2;
+					double R2 = -A0 + A1 + 4*(p0 - p1)*A2;
+					double B2 = (R1 - R2*p1)/(4*p1*p1);
+					double B1 = R2 + 4*(p1 - p0)*B2;
+					b1 = -0.5*std::sqrt(B1);
+					b0 = 0.5*(std::sqrt(B2 + 0.25*B1) - b1);
+					b2 = -b0 - b1;
+					return *this;
+				} else if (type == Type::notch) {
+					// The Vicanek paper doesn't cover notches (band-stop), but we know where the zeros should be:
+					b0 = 1;
+					b1 = -2*std::cos(w0);
+					b2 = 1;
+					// Scale so that B0 == A0 to get 0dB at f=0
+					double scale = std::sqrt(A0)/(b0 + b1 + b2);
+					b0 *= scale;
+					b1 *= scale;
+					b2 *= scale;
+					return *this;
+				} else if (type == Type::peak) {
+					double G2 = (sqrtGain*sqrtGain)*(sqrtGain*sqrtGain);
+					double R1 = (A0*p0 + A1*p1 + A2*p2)*G2;
+					double R2 = (-A0 + A1 + 4*(p0 - p1)*A2)*G2;
+					double B0 = A0;
+					double B2 = (R1 - R2*p1 - B0)/(4*p1*p1);
+					double B1 = R2 + B0 + 4*(p1 - p0)*B2;
+					double W = 0.5*(std::sqrt(B0) + std::sqrt(B1));
+					b0 = 0.5*(W + std::sqrt(W*W + B2));
+					b1 = 0.5*(std::sqrt(B0) - std::sqrt(B1));
+					b2 = -B2/(4*b0);
+					return *this;
+				}
+				// All others fall back to `oneSided`
+				design = BiquadDesign::oneSided;
+				calc.oneSidedCompQ();
+			}
+
+			double alpha = calc.sinW0*calc.inv2Q;
+			double A = sqrtGain, sqrtA2alpha = 2*std::sqrt(A)*alpha;
+
+			double a0;
+			if (type == Type::highpass) {
+				b1 = -1 - calc.cosW0;
+				b0 = b2 = (1 + calc.cosW0)*0.5;
+				a0 = 1 + alpha;
+				a1 = -2*calc.cosW0;
+				a2 = 1 - alpha;
+			} else if (type == Type::lowpass) {
+				b1 = 1 - calc.cosW0;
+				b0 = b2 = b1*0.5;
+				a0 = 1 + alpha;
+				a1 = -2*calc.cosW0;
+				a2 = 1 - alpha;
+			} else if (type == Type::highShelf) {
+				b0 = A*((A+1)+(A-1)*calc.cosW0+sqrtA2alpha);
+				b2 = A*((A+1)+(A-1)*calc.cosW0-sqrtA2alpha);
+				b1 = -2*A*((A-1)+(A+1)*calc.cosW0);
+				a0 = (A+1)-(A-1)*calc.cosW0+sqrtA2alpha;
+				a2 = (A+1)-(A-1)*calc.cosW0-sqrtA2alpha;
+				a1 = 2*((A-1)-(A+1)*calc.cosW0);
+			} else if (type == Type::lowShelf) {
+				b0 = A*((A+1)-(A-1)*calc.cosW0+sqrtA2alpha);
+				b2 = A*((A+1)-(A-1)*calc.cosW0-sqrtA2alpha);
+				b1 = 2*A*((A-1)-(A+1)*calc.cosW0);
+				a0 = (A+1)+(A-1)*calc.cosW0+sqrtA2alpha;
+				a2 = (A+1)+(A-1)*calc.cosW0-sqrtA2alpha;
+				a1 = -2*((A-1)+(A+1)*calc.cosW0);
+			} else if (type == Type::bandpass) {
+				b0 = alpha;
+				b1 = 0;
+				b2 = -alpha;
+				a0 = 1 + alpha;
+				a1 = -2*calc.cosW0;
+				a2 = 1 - alpha;
+			} else if (type == Type::notch) {
+				b0 = 1;
+				b1 = -2*calc.cosW0;
+				b2 = 1;
+				a0 = 1 + alpha;
+				a1 = b1;
+				a2 = 1 - alpha;
+			} else if (type == Type::peak) {
+				b0 = 1 + alpha*A;
+				b1 = -2*calc.cosW0;
+				b2 = 1 - alpha*A;
+				a0 = 1 + alpha/A;
+				a1 = b1;
+				a2 = 1 - alpha/A;
+			} else if (type == Type::allpass) {
+				a0 = b2 = 1 + alpha;
+				a1 = b1 = -2*calc.cosW0;
+				a2 = b0 = 1 - alpha;
+			} else {
+				// reset to neutral
+				a1 = a2 = b1 = b2 = 0;
+				a0 = b0 = 1;
+			}
+			double invA0 = 1/a0;
+			b0 *= invA0;
+			b1 *= invA0;
+			b2 *= invA0;
+			a1 *= invA0;
+			a2 *= invA0;
+			return *this;
+		}
+	public:
+		static constexpr double defaultQ = 0.7071067811865476; // sqrt(0.5)
+		static constexpr double defaultBandwidth = 1.8999686269529916; // equivalent to above Q
+
+		Sample operator ()(Sample x0) {
+			Sample y0 = x0*b0 + x1*b1 + x2*b2 - y1*a1 - y2*a2;
+			y2 = y1;
+			y1 = y0;
+			x2 = x1;
+			x1 = x0;
+			return y0;
+		}
+		
+		void reset() {
+			x1 = x2 = y1 = y2 = 0;
+		}
+		
+		std::complex<Sample> response(Sample scaledFreq) const {
+			Sample w = scaledFreq*Sample(2*M_PI);
+			std::complex<Sample> invZ = {std::cos(w), -std::sin(w)}, invZ2 = invZ*invZ;
+			return (b0 + invZ*b1 + invZ2*b2)/(Sample(1) + invZ*a1 + invZ2*a2);
+		}
+		Sample responseDb(Sample scaledFreq) const {
+			Sample w = scaledFreq*Sample(2*M_PI);
+			std::complex<Sample> invZ = {std::cos(w), -std::sin(w)}, invZ2 = invZ*invZ;
+			Sample energy = std::norm(b0 + invZ*b1 + invZ2*b2)/std::norm(Sample(1) + invZ*a1 + invZ2*a2);
+			return 10*std::log10(energy);
+		}
+
+		/// @name Lowpass
+		/// @{
+		BiquadStatic & lowpass(double scaledFreq, double octaves=defaultBandwidth, BiquadDesign design=BiquadDesign::bilinear) {
+			return configure(Type::lowpass, octaveSpec(scaledFreq, octaves, design), 0, design);
+		}
+		BiquadStatic & lowpassQ(double scaledFreq, double q, BiquadDesign design=BiquadDesign::bilinear) {
+			return configure(Type::lowpass, qSpec(scaledFreq, q, design), 0, design);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		void lowpass(double scaledFreq, double octaves, bool correctBandwidth) {
+			lowpass(scaledFreq, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @deprecated By the time you care about `design`, you should care about the bandwidth
+		BiquadStatic & lowpass(double scaledFreq, BiquadDesign design) {
+			return lowpass(scaledFreq, defaultBandwidth, design);
+		}
+		/// @}
+		
+		/// @name Highpass
+		/// @{
+		BiquadStatic & highpass(double scaledFreq, double octaves=defaultBandwidth, BiquadDesign design=BiquadDesign::bilinear) {
+			return configure(Type::highpass, octaveSpec(scaledFreq, octaves, design), 0, design);
+		}
+		BiquadStatic & highpassQ(double scaledFreq, double q, BiquadDesign design=BiquadDesign::bilinear) {
+			return configure(Type::highpass, qSpec(scaledFreq, q, design), 0, design);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		void highpass(double scaledFreq, double octaves, bool correctBandwidth) {
+			highpass(scaledFreq, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @deprecated By the time you care about `design`, you should care about the bandwidth
+		BiquadStatic & highpass(double scaledFreq, BiquadDesign design) {
+			return highpass(scaledFreq, defaultBandwidth, design);
+		}
+		/// @}
+
+		/// @name Bandpass
+		/// @{
+		BiquadStatic & bandpass(double scaledFreq, double octaves=defaultBandwidth, BiquadDesign design=bwDesign) {
+			return configure(Type::bandpass, octaveSpec(scaledFreq, octaves, design), 0, design);
+		}
+		BiquadStatic & bandpassQ(double scaledFreq, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::bandpass, qSpec(scaledFreq, q, design), 0, design);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		void bandpass(double scaledFreq, double octaves, bool correctBandwidth) {
+			bandpass(scaledFreq, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @deprecated By the time you care about `design`, you should care about the bandwidth
+		BiquadStatic & bandpass(double scaledFreq, BiquadDesign design) {
+			return bandpass(scaledFreq, defaultBandwidth, design);
+		}
+		/// @}
+
+		/// @name Notch
+		/// @{
+		BiquadStatic & notch(double scaledFreq, double octaves=defaultBandwidth, BiquadDesign design=bwDesign) {
+			return configure(Type::notch, octaveSpec(scaledFreq, octaves, design), 0, design);
+		}
+		BiquadStatic & notchQ(double scaledFreq, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::notch, qSpec(scaledFreq, q, design), 0, design);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		void notch(double scaledFreq, double octaves, bool correctBandwidth) {
+			notch(scaledFreq, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @deprecated By the time you care about `design`, you should care about the bandwidth
+		BiquadStatic & notch(double scaledFreq, BiquadDesign design) {
+			return notch(scaledFreq, defaultBandwidth, design);
+		}
+		/// @deprecated alias for `.notch()`
+		void bandStop(double scaledFreq, double octaves=1, bool correctBandwidth=true) {
+			notch(scaledFreq, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @}
+
+		/// @name Peak
+		/// @{
+		BiquadStatic & peak(double scaledFreq, double gain, double octaves=1, BiquadDesign design=bwDesign) {
+			return configure(Type::peak, octaveSpec(scaledFreq, octaves, design), std::sqrt(gain), design);
+		}
+		BiquadStatic & peakDb(double scaledFreq, double db, double octaves=1, BiquadDesign design=bwDesign) {
+			return configure(Type::peak, octaveSpec(scaledFreq, octaves, design), dbToSqrtGain(db), design);
+		}
+		BiquadStatic & peakQ(double scaledFreq, double gain, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::peak, qSpec(scaledFreq, q, design), std::sqrt(gain), design);
+		}
+		BiquadStatic & peakDbQ(double scaledFreq, double db, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::peak, qSpec(scaledFreq, q, design), dbToSqrtGain(db), design);
+		}
+		/// @deprecated By the time you care about `design`, you should care about the bandwidth
+		BiquadStatic & peak(double scaledFreq, double gain, BiquadDesign design) {
+			return peak(scaledFreq, gain, 1, design);
+		}
+		/// @}
+
+		/// @name High shelf
+		/// @{
+		BiquadStatic & highShelf(double scaledFreq, double gain, double octaves=defaultBandwidth, BiquadDesign design=bwDesign) {
+			return configure(Type::highShelf, octaveSpec(scaledFreq, octaves, design), std::sqrt(gain), design);
+		}
+		BiquadStatic & highShelfDb(double scaledFreq, double db, double octaves=defaultBandwidth, BiquadDesign design=bwDesign) {
+			return configure(Type::highShelf, octaveSpec(scaledFreq, octaves, design), dbToSqrtGain(db), design);
+		}
+		BiquadStatic & highShelfQ(double scaledFreq, double gain, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::highShelf, qSpec(scaledFreq, q, design), std::sqrt(gain), design);
+		}
+		BiquadStatic & highShelfDbQ(double scaledFreq, double db, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::highShelf, qSpec(scaledFreq, q, design), dbToSqrtGain(db), design);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		BiquadStatic & highShelf(double scaledFreq, double gain, double octaves, bool correctBandwidth) {
+			return highShelf(scaledFreq, gain, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		BiquadStatic & highShelfDb(double scaledFreq, double db, double octaves, bool correctBandwidth) {
+			return highShelfDb(scaledFreq, db, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @}
+
+		/// @name Low shelf
+		/// @{
+		BiquadStatic & lowShelf(double scaledFreq, double gain, double octaves=2, BiquadDesign design=bwDesign) {
+			return configure(Type::lowShelf, octaveSpec(scaledFreq, octaves, design), std::sqrt(gain), design);
+		}
+		BiquadStatic & lowShelfDb(double scaledFreq, double db, double octaves=2, BiquadDesign design=bwDesign) {
+			return configure(Type::lowShelf, octaveSpec(scaledFreq, octaves, design), dbToSqrtGain(db), design);
+		}
+		BiquadStatic & lowShelfQ(double scaledFreq, double gain, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::lowShelf, qSpec(scaledFreq, q, design), std::sqrt(gain), design);
+		}
+		BiquadStatic & lowShelfDbQ(double scaledFreq, double db, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::lowShelf, qSpec(scaledFreq, q, design), dbToSqrtGain(db), design);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		BiquadStatic & lowShelf(double scaledFreq, double gain, double octaves, bool correctBandwidth) {
+			return lowShelf(scaledFreq, gain, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @deprecated use `BiquadDesign` instead
+		BiquadStatic & lowShelfDb(double scaledFreq, double db, double octaves, bool correctBandwidth) {
+			return lowShelfDb(scaledFreq, db, octaves, correctBandwidth ? bwDesign : BiquadDesign::bilinear);
+		}
+		/// @}
+		
+		/// @name Allpass
+		/// @{
+		BiquadStatic & allpass(double scaledFreq, double octaves=1, BiquadDesign design=bwDesign) {
+			return configure(Type::allpass, octaveSpec(scaledFreq, octaves, design), 0, design);
+		}
+		BiquadStatic & allpassQ(double scaledFreq, double q, BiquadDesign design=bwDesign) {
+			return configure(Type::allpass, qSpec(scaledFreq, q, design), 0, design);
+		}
+		/// @}
+
+		BiquadStatic & addGain(double factor) {
+			b0 *= factor;
+			b1 *= factor;
+			b2 *= factor;
+			return *this;
+		}
+		BiquadStatic & addGainDb(double db) {
+			return addGain(std::pow(10, db*0.05));
+		}
+	};
+
+	/** @} */
+}} // signalsmith::filters::
+#endif // include guard
diff --git a/dsp/mix.h b/dsp/mix.h
new file mode 100644
index 0000000..5b85a3c
--- /dev/null
+++ b/dsp/mix.h
@@ -0,0 +1,218 @@
+#ifndef SIGNALSMITH_DSP_MULTI_CHANNEL_H
+#define SIGNALSMITH_DSP_MULTI_CHANNEL_H
+
+#include "./common.h"
+
+#include <array>
+
+namespace signalsmith {
+namespace mix {
+	/**	@defgroup Mix Multichannel mixing
+		@brief Utilities for stereo/multichannel mixing operations
+
+		@{
+		@file
+	*/
+
+	/** @defgroup Matrices Orthogonal matrices
+		@brief Some common matrices used for audio
+		@ingroup Mix
+		@{ */
+
+	/// @brief Hadamard: high mixing levels, N log(N) operations
+	template<typename Sample, int size=-1>
+	class Hadamard {
+	public:
+		static_assert(size >= 0, "Size must be positive (or -1 for dynamic)");
+		/// Applies the matrix, scaled so it's orthogonal
+		template<class Data>
+		static void inPlace(Data &&data) {
+			unscaledInPlace(data);
+			
+			Sample factor = scalingFactor();
+			for (int c = 0; c < size; ++c) {
+				data[c] *= factor;
+			}
+		}
+
+		/// Scaling factor applied to make it orthogonal
+		static Sample scalingFactor() {
+			/// TODO: test for C++20, or whatever makes this constexpr.  Maybe a `#define` in `common.h`?
+			return std::sqrt(Sample(1)/(size ? size : 1));
+		}
+
+		/// Skips the scaling, so it's a matrix full of `1`s
+		template<class Data, int startIndex=0>
+		static void unscaledInPlace(Data &&data) {
+			if (size <= 1) return;
+			constexpr int hSize = size/2;
+
+			Hadamard<Sample, hSize>::template unscaledInPlace<Data, startIndex>(data);
+			Hadamard<Sample, hSize>::template unscaledInPlace<Data, startIndex + hSize>(data);
+
+			for (int i = 0; i < hSize; ++i) {
+				Sample a = data[i + startIndex], b = data[i + startIndex + hSize];
+				data[i + startIndex] = (a + b);
+				data[i + startIndex + hSize] = (a - b);
+			}
+		}
+	};
+	/// @brief Hadamard with dynamic size
+	template<typename Sample>
+	class Hadamard<Sample, -1> {
+		int size;
+	public:
+		Hadamard(int size) : size(size) {}
+		
+		/// Applies the matrix, scaled so it's orthogonal
+		template<class Data>
+		void inPlace(Data &&data) const {
+			unscaledInPlace(data);
+			
+			Sample factor = scalingFactor();
+			for (int c = 0; c < size; ++c) {
+				data[c] *= factor;
+			}
+		}
+
+		/// Scaling factor applied to make it orthogonal
+		Sample scalingFactor() const {
+			return std::sqrt(Sample(1)/(size ? size : 1));
+		}
+
+		/// Skips the scaling, so it's a matrix full of `1`s
+		template<class Data>
+		void unscaledInPlace(Data &&data) const {
+			int hSize = size/2;
+			while (hSize > 0) {
+				for (int startIndex = 0; startIndex < size; startIndex += hSize*2) {
+					for (int i = startIndex; i < startIndex + hSize; ++i) {
+						Sample a = data[i], b = data[i + hSize];
+						data[i] = (a + b);
+						data[i + hSize] = (a - b);
+					}
+				}
+				hSize /= 2;
+			}
+		}
+	};
+	/// @brief Householder: moderate mixing, 2N operations
+	template<typename Sample, int size=-1>
+	class Householder {
+	public:
+		static_assert(size >= 0, "Size must be positive (or -1 for dynamic)");
+		template<class Data>
+		static void inPlace(Data &&data) {
+			if (size < 1) return;
+			/// TODO: test for C++20, which makes `std::complex::operator/` constexpr
+			const Sample factor = Sample(-2)/Sample(size ? size : 1);
+
+			Sample sum = data[0];
+			for (int i = 1; i < size; ++i) {
+				sum += data[i];
+			}
+			sum *= factor;
+			for (int i = 0; i < size; ++i) {
+				data[i] += sum;
+			}
+		}
+		/// @deprecated The matrix is already orthogonal, but this is here for compatibility with Hadamard
+		constexpr static Sample scalingFactor() {
+			return 1;
+		}
+	};
+	/// @brief Householder with dynamic size
+	template<typename Sample>
+	class Householder<Sample, -1> {
+		int size;
+	public:
+		Householder(int size) : size(size) {}
+	
+		template<class Data>
+		void inPlace(Data &&data) const {
+			if (size < 1) return;
+			const Sample factor = Sample(-2)/Sample(size ? size : 1);
+
+			Sample sum = data[0];
+			for (int i = 1; i < size; ++i) {
+				sum += data[i];
+			}
+			sum *= factor;
+			for (int i = 0; i < size; ++i) {
+				data[i] += sum;
+			}
+		}
+		/// @deprecated The matrix is already orthogonal, but this is here for compatibility with Hadamard
+		constexpr static Sample scalingFactor() {
+			return 1;
+		}
+	};
+	///  @}
+	
+	/** @brief Upmix/downmix a stereo signal to an (even) multi-channel signal
+		
+		When spreading out, it rotates the input by various amounts (e.g. a four-channel signal would produce `(left, right, mid side)`), such that energy is preserved for each pair.
+		
+		When mixing together, it uses the opposite rotations, such that upmix → downmix produces the same stereo signal (when scaled by `.scalingFactor1()`.
+	*/
+	template<typename Sample, int channels>
+	class StereoMultiMixer {
+		static_assert((channels/2)*2 == channels, "StereoMultiMixer must have an even number of channels");
+		static_assert(channels > 0, "StereoMultiMixer must have a positive number of channels");
+		static constexpr int hChannels = channels/2;
+		std::array<Sample, channels> coeffs;
+	public:
+		StereoMultiMixer() {
+			coeffs[0] = 1;
+			coeffs[1] = 0;
+			for (int i = 1; i < hChannels; ++i) {
+				double phase = M_PI*i/channels;
+				coeffs[2*i] = std::cos(phase);
+				coeffs[2*i + 1] = std::sin(phase);
+			}
+		}
+		
+		template<class In, class Out>
+		void stereoToMulti(In &input, Out &output) const {
+			output[0] = input[0];
+			output[1] = input[1];
+			for (int i = 2; i < channels; i += 2) {
+				output[i] = input[0]*coeffs[i] + input[1]*coeffs[i + 1];
+				output[i + 1] = input[1]*coeffs[i] - input[0]*coeffs[i + 1];
+			}
+		}
+		template<class In, class Out>
+		void multiToStereo(In &input, Out &output) const {
+			output[0] = input[0];
+			output[1] = input[1];
+			for (int i = 2; i < channels; i += 2) {
+				output[0] += input[i]*coeffs[i] - input[i + 1]*coeffs[i + 1];
+				output[1] += input[i + 1]*coeffs[i] + input[i]*coeffs[i + 1];
+			}
+		}
+		/// Scaling factor for the downmix, if channels are phase-aligned
+		static constexpr Sample scalingFactor1() {
+			return 2/Sample(channels);
+		}
+		/// Scaling factor for the downmix, if channels are independent
+		static Sample scalingFactor2() {
+			return std::sqrt(scalingFactor1());
+		}
+	};
+	
+	/// A cheap (polynomial) almost-energy-preserving crossfade
+	/// Maximum energy error: 1.06%, average 0.64%, curves overshoot by 0.3%
+	/// See: http://signalsmith-audio.co.uk/writing/2021/cheap-energy-crossfade/
+	template<typename Sample, typename Result>
+	void cheapEnergyCrossfade(Sample x, Result &toCoeff, Result &fromCoeff) {
+		Sample x2 = 1 - x;
+		// Other powers p can be approximated by: k = -6.0026608 + p*(6.8773512 - 1.5838104*p)
+		Sample A = x*x2, B = A*(1 + (Sample)1.4186*A);
+		Sample C = (B + x), D = (B + x2);
+		toCoeff = C*C;
+		fromCoeff = D*D;
+	}
+
+/** @} */
+}} // signalsmith::delay::
+#endif // include guard
diff --git a/dsp/perf.h b/dsp/perf.h
new file mode 100644
index 0000000..ab72821
--- /dev/null
+++ b/dsp/perf.h
@@ -0,0 +1,43 @@
+#ifndef SIGNALSMITH_DSP_PERF_H
+#define SIGNALSMITH_DSP_PERF_H
+
+#include <complex>
+
+namespace signalsmith {
+namespace perf {
+	/**	@defgroup Performance Performance helpers
+		@brief Nothing serious, just some `#defines` and helpers
+		
+		@{
+		@file
+	*/
+		
+	/// *Really* insist that a function/method is inlined (mostly for performance in DEBUG builds)
+	#ifndef SIGNALSMITH_INLINE
+	#ifdef __GNUC__
+	#define SIGNALSMITH_INLINE __attribute__((always_inline)) inline
+	#elif defined(__MSVC__)
+	#define SIGNALSMITH_INLINE __forceinline inline
+	#else
+	#define SIGNALSMITH_INLINE inline
+	#endif
+	#endif
+
+	/** @brief Complex-multiplication (with optional conjugate second-arg), without handling NaN/Infinity
+		The `std::complex` multiplication has edge-cases around NaNs which slow things down and prevent auto-vectorisation.
+	*/
+	template <bool conjugateSecond=false, typename V>
+	SIGNALSMITH_INLINE static std::complex<V> mul(const std::complex<V> &a, const std::complex<V> &b) {
+		return conjugateSecond ? std::complex<V>{
+			b.real()*a.real() + b.imag()*a.imag(),
+			b.real()*a.imag() - b.imag()*a.real()
+		} : std::complex<V>{
+			a.real()*b.real() - a.imag()*b.imag(),
+			a.real()*b.imag() + a.imag()*b.real()
+		};
+	}
+
+/** @} */
+}} // signalsmith::perf::
+
+#endif // include guard
diff --git a/dsp/spectral.h b/dsp/spectral.h
new file mode 100644
index 0000000..823571d
--- /dev/null
+++ b/dsp/spectral.h
@@ -0,0 +1,455 @@
+#ifndef SIGNALSMITH_DSP_SPECTRAL_H
+#define SIGNALSMITH_DSP_SPECTRAL_H
+
+#include "./common.h"
+#include "./perf.h"
+#include "./fft.h"
+#include "./windows.h"
+#include "./delay.h"
+
+#include <cmath>
+
+namespace signalsmith {
+namespace spectral {
+	/**	@defgroup Spectral Spectral Processing
+		@brief Tools for frequency-domain manipulation of audio signals
+		
+		@{
+		@file
+	*/
+	
+	/** @brief An FFT with built-in windowing and round-trip scaling
+	
+		This uses a Modified Real FFT, which applies half-bin shift before the transform.  The result therefore has `N/2` bins, centred at the frequencies: `(i + 0.5)/N`.
+		
+		This avoids the awkward (real-valued) bands for DC-offset and Nyquist.
+	 */
+	template<typename Sample>
+	class WindowedFFT {
+		using MRFFT = signalsmith::fft::ModifiedRealFFT<Sample>;
+		using Complex = std::complex<Sample>;
+		MRFFT mrfft{2};
+
+		std::vector<Sample> fftWindow;
+		std::vector<Sample> timeBuffer;
+		std::vector<Complex> freqBuffer;
+	public:
+		/// Returns a fast FFT size <= `size`
+		static int fastSizeAbove(int size, int divisor=1) {
+			return MRFFT::fastSizeAbove(size/divisor)*divisor;
+		}
+		/// Returns a fast FFT size >= `size`
+		static int fastSizeBelow(int size, int divisor=1) {
+			return MRFFT::fastSizeBelow(1 + (size - 1)/divisor)*divisor;
+		}
+
+		WindowedFFT() {}
+		WindowedFFT(int size) {
+			setSize(size);
+		}
+		template<class WindowFn>
+		WindowedFFT(int size, WindowFn fn, Sample windowOffset=0.5) {
+			setSize(size, fn, windowOffset);
+		}
+
+		/// Sets the size, returning the window for modification (initially all 1s)
+		std::vector<Sample> & setSizeWindow(int size) {
+			mrfft.setSize(size);
+			fftWindow.resize(size, 1);
+			timeBuffer.resize(size);
+			freqBuffer.resize(size);
+			return fftWindow;
+		}
+		/// Sets the FFT size, with a user-defined functor for the window
+		template<class WindowFn>
+		void setSize(int size, WindowFn fn, Sample windowOffset=0.5) {
+			setSizeWindow(size);
+		
+			Sample invSize = 1/(Sample)size;
+			for (int i = 0; i < size; ++i) {
+				Sample r = (i + windowOffset)*invSize;
+				fftWindow[i] = fn(r);
+			}
+		}
+		/// Sets the size (using the default Blackman-Harris window)
+		void setSize(int size) {
+			setSize(size, [](double x) {
+				double phase = 2*M_PI*x;
+				// Blackman-Harris
+				return 0.35875 + 0.48829*std::cos(phase) + 0.14128*std::cos(phase*2) + 0.1168*std::cos(phase*3);
+			});
+		}
+
+		const std::vector<Sample> & window() const {
+			return this->fftWindow;
+		}
+		int size() const {
+			return mrfft.size();
+		}
+		
+		/// Performs an FFT (with windowing)
+		template<class Input, class Output>
+		void fft(Input &&input, Output &&output) {
+			struct WindowedInput {
+				const Input &input;
+				std::vector<Sample> &window;
+				SIGNALSMITH_INLINE Sample operator [](int i) {
+					return input[i]*window[i];
+				}
+			};
+		
+			mrfft.fft(WindowedInput{input, fftWindow}, output);
+		}
+		/// Performs an FFT (no windowing)
+		template<class Input, class Output>
+		void fftRaw(Input &&input, Output &&output) {
+			mrfft.fft(input, output);
+		}
+
+		/// Inverse FFT, with windowing and 1/N scaling
+		template<class Input, class Output>
+		void ifft(Input &&input, Output &&output) {
+			mrfft.ifft(input, output);
+			int size = mrfft.size();
+			Sample norm = 1/(Sample)size;
+			for (int i = 0; i < size; ++i) {
+				output[i] *= norm*fftWindow[i];
+			}
+		}
+		/// Performs an IFFT (no windowing)
+		template<class Input, class Output>
+		void ifftRaw(Input &&input, Output &&output) {
+			mrfft.ifft(input, output);
+		}
+	};
+	
+	/** STFT synthesis, built on a `MultiBuffer`.
+ 
+		Any window length and block interval is supported, but the FFT size may be rounded up to a faster size (by zero-padding).  It uses a heuristically-optimal Kaiser window modified for perfect-reconstruction.
+		
+		\diagram{stft-aliasing-simulated.svg,Simulated bad-case aliasing (random phase-shift for each band) for overlapping ratios}
+
+		There is a "latest valid index", and you can read the output up to one `historyLength` behind this (see `.resize()`).  You can read up to one window-length _ahead_ to get partially-summed future output.
+		
+		\diagram{stft-buffer-validity.svg}
+		
+		You move the valid index along using `.ensureValid()`, passing in a functor which provides spectra (using `.analyse()` and/or direct modification through `.spectrum[c]`):
+
+		\code
+			void processSample(...) {
+				stft.ensureValid([&](int) {
+					// Here, we introduce (1 - windowSize) of latency
+					stft.analyse(inputBuffer.view(1 - windowSize))
+				});
+				// read as a MultiBuffer
+				auto result = stft.at(0);
+				++stft; // also moves the latest valid index
+			}
+
+			void processBlock(...) {
+				// assuming `historyLength` == blockSize
+				stft.ensureValid(blockSize, [&](int blockStartIndex) {
+					int inputStart = blockStartIndex + (1 - windowSize);
+					stft.analyse(inputBuffer.view(inputStart));
+				});
+				auto earliestValid = stft.at(0);
+				auto latestValid = stft.at(blockSize);
+				stft += blockSize;
+			}
+		\endcode
+		
+		The index passed to this functor will be greater than the previous valid index, and `<=` the index you pass in.  Therefore, if you call `.ensureValid()` every sample, it can only ever be `0`.
+	*/
+	template<typename Sample>
+	class STFT : public signalsmith::delay::MultiBuffer<Sample> {
+		using Super = signalsmith::delay::MultiBuffer<Sample>;
+		using Complex = std::complex<Sample>;
+
+		int channels = 0, _windowSize = 0, _fftSize = 0, _interval = 1;
+		int validUntilIndex = 0;
+
+		class MultiSpectrum {
+			int channels, stride;
+			std::vector<Complex> buffer;
+		public:
+			MultiSpectrum() : MultiSpectrum(0, 0) {}
+			MultiSpectrum(int channels, int bands) : channels(channels), stride(bands), buffer(channels*bands, 0) {}
+			
+			void resize(int nChannels, int nBands) {
+				channels = nChannels;
+				stride = nBands;
+				buffer.assign(channels*stride, 0);
+			}
+			
+			void reset() {
+				buffer.assign(buffer.size(), 0);
+			}
+			
+			void swap(MultiSpectrum &other) {
+				using std::swap;
+				swap(buffer, other.buffer);
+			}
+
+			Complex * operator [](int channel) {
+				return buffer.data() + channel*stride;
+			}
+			const Complex * operator [](int channel) const {
+				return buffer.data() + channel*stride;
+			}
+		};
+		std::vector<Sample> timeBuffer;
+
+		void resizeInternal(int newChannels, int windowSize, int newInterval, int historyLength, int zeroPadding) {
+			Super::resize(newChannels,
+				windowSize /* for output summing */
+				+ newInterval /* so we can read `windowSize` ahead (we'll be at most `interval-1` from the most recent block */
+				+ historyLength);
+
+			int fftSize = fft.fastSizeAbove(windowSize + zeroPadding);
+			
+			this->channels = newChannels;
+			_windowSize = windowSize;
+			this->_fftSize = fftSize;
+			this->_interval = newInterval;
+			validUntilIndex = -1;
+			
+			using Kaiser = ::signalsmith::windows::Kaiser;
+
+			/// Roughly optimal Kaiser for STFT analysis (forced to perfect reconstruction)
+			auto &window = fft.setSizeWindow(fftSize);
+			auto kaiser = Kaiser::withBandwidth(windowSize/(double)_interval, true);
+			kaiser.fill(window, windowSize);
+			::signalsmith::windows::forcePerfectReconstruction(window, windowSize, _interval);
+			
+			// TODO: fill extra bits of an input buffer with NaN/Infinity, to break this, and then fix by adding zero-padding to WindowedFFT (as opposed to zero-valued window sections)
+			for (int i = windowSize; i < fftSize; ++i) {
+				window[i] = 0;
+			}
+
+			spectrum.resize(channels, fftSize/2);
+			timeBuffer.resize(fftSize);
+		}
+	public:
+		using Spectrum = MultiSpectrum;
+		Spectrum spectrum;
+		WindowedFFT<Sample> fft;
+
+		STFT() {}
+		/// Parameters passed straight to `.resize()`
+		STFT(int channels, int windowSize, int interval, int historyLength=0, int zeroPadding=0) {
+			resize(channels, windowSize, interval, historyLength, zeroPadding);
+		}
+
+		/// Sets the channel-count, FFT size and interval.
+		void resize(int nChannels, int windowSize, int interval, int historyLength=0, int zeroPadding=0) {
+			resizeInternal(nChannels, windowSize, interval, historyLength, zeroPadding);
+		}
+		
+		int windowSize() const {
+			return _windowSize;
+		}
+		int fftSize() const {
+			return _fftSize;
+		}
+		int interval() const {
+			return _interval;
+		}
+		/// Returns the (analysis and synthesis) window
+		decltype(fft.window()) window() const {
+			return fft.window();
+		}
+		/// Calculates the effective window for the partially-summed future output (relative to the most recent block)
+		std::vector<Sample> partialSumWindow() const {
+			const auto &w = window();
+			std::vector<Sample> result(_windowSize, 0);
+			for (int offset = 0; offset < _windowSize; offset += _interval) {
+				for (int i = 0; i < _windowSize - offset; ++i) {
+					Sample value = w[i + offset];
+					result[i] += value*value;
+				}
+			}
+			return result;
+		}
+		
+		/// Resets everything - since we clear the output sum, it will take `windowSize` samples to get proper output.
+		void reset() {
+			Super::reset();
+			spectrum.reset();
+			validUntilIndex = -1;
+		}
+		
+		/** Generates valid output up to the specified index (or 0), using the callback as many times as needed.
+			
+			The callback should be a functor accepting a single integer argument, which is the index for which a spectrum is required.
+			
+			The block created from these spectra will start at this index in the output, plus `.latency()`.
+		*/
+		template<class AnalysisFn>
+		void ensureValid(int i, AnalysisFn fn) {
+			while (validUntilIndex < i) {
+				int blockIndex = validUntilIndex + 1;
+				fn(blockIndex);
+
+				auto output = this->view(blockIndex);
+				for (int c = 0; c < channels; ++c) {
+					auto channel = output[c];
+
+					// Clear out the future sum, a window-length and an interval ahead
+					for (int wi = _windowSize; wi < _windowSize + _interval; ++wi) {
+						channel[wi] = 0;
+					}
+
+					// Add in the IFFT'd result
+					fft.ifft(spectrum[c], timeBuffer);
+					for (int wi = 0; wi < _windowSize; ++wi) {
+						channel[wi] += timeBuffer[wi];
+					}
+				}
+				validUntilIndex += _interval;
+			}
+		}
+		/// The same as above, assuming index 0
+		template<class AnalysisFn>
+		void ensureValid(AnalysisFn fn) {
+			return ensureValid(0, fn);
+		}
+		
+		/** Analyse a multi-channel input, for any type where `data[channel][index]` returns samples
+ 
+		Results can be read/edited using `.spectrum`. */
+		template<class Data>
+		void analyse(Data &&data) {
+			for (int c = 0; c < channels; ++c) {
+				fft.fft(data[c], spectrum[c]);
+			}
+		}
+		template<class Data>
+		void analyse(int c, Data &&data) {
+			fft.fft(data, spectrum[c]);
+		}
+		/// Analyse without windowing
+		template<class Data>
+		void analyseRaw(Data &&data) {
+			for (int c = 0; c < channels; ++c) {
+				fft.fftRaw(data[c], spectrum[c]);
+			}
+		}
+		template<class Data>
+		void analyseRaw(int c, Data &&data) {
+			fft.fftRaw(data, spectrum[c]);
+		}
+
+		int bands() const {
+			return _fftSize/2;
+		}
+
+		/** Internal latency (between the block-index requested in `.ensureValid()` and its position in the output)
+ 
+		Currently unused, but it's in here to allow for a future implementation which spreads the FFT calculations out across each interval.*/
+		int latency() {
+			return 0;
+		}
+		
+		// @name Shift the underlying buffer (moving the "valid" index accordingly)
+		// @{
+		STFT & operator ++() {
+			Super::operator ++();
+			validUntilIndex--;
+			return *this;
+		}
+		STFT & operator +=(int i) {
+			Super::operator +=(i);
+			validUntilIndex -= i;
+			return *this;
+		}
+		STFT & operator --() {
+			Super::operator --();
+			validUntilIndex++;
+			return *this;
+		}
+		STFT & operator -=(int i) {
+			Super::operator -=(i);
+			validUntilIndex += i;
+			return *this;
+		}
+		// @}
+
+		typename Super::MutableView operator ++(int postIncrement) {
+			auto result = Super::operator ++(postIncrement);
+			validUntilIndex--;
+			return result;
+		}
+		typename Super::MutableView operator --(int postIncrement) {
+			auto result = Super::operator --(postIncrement);
+			validUntilIndex++;
+			return result;
+		}
+	};
+
+	/** STFT processing, with input/output.
+		Before calling `.ensureValid(index)`, you should make sure the input is filled up to `index`.
+	*/
+	template<typename Sample>
+	class ProcessSTFT : public STFT<Sample> {
+		using Super = STFT<Sample>;
+	public:
+		signalsmith::delay::MultiBuffer<Sample> input;
+	
+		ProcessSTFT(int inChannels, int outChannels, int windowSize, int interval, int historyLength=0) {
+			resize(inChannels, outChannels, windowSize, interval, historyLength);
+		}
+
+		/** Alter the spectrum, using input up to this point, for the output block starting from this point.
+			Sub-classes should replace this with whatever processing is desired. */
+		virtual void processSpectrum(int /*blockIndex*/) {}
+		
+		/// Sets the input/output channels, FFT size and interval.
+		void resize(int inChannels, int outChannels, int windowSize, int interval, int historyLength=0) {
+			Super::resize(outChannels, windowSize, interval, historyLength);
+			input.resize(inChannels, windowSize + interval + historyLength);
+		}
+		void reset(Sample value=Sample()) {
+			Super::reset(value);
+			input.reset(value);
+		}
+
+		/// Internal latency, including buffering samples for analysis.
+		int latency() {
+			return Super::latency() + (this->windowSize() - 1);
+		}
+		
+		void ensureValid(int i=0) {
+			Super::ensureValid(i, [&](int blockIndex) {
+				this->analyse(input.view(blockIndex - this->windowSize() + 1));
+				this->processSpectrum(blockIndex);
+			});
+		}
+
+		// @name Shift the output, input, and valid index.
+		// @{
+		ProcessSTFT & operator ++() {
+			Super::operator ++();
+			++input;
+			return *this;
+		}
+		ProcessSTFT & operator +=(int i) {
+			Super::operator +=(i);
+			input += i;
+			return *this;
+		}
+		ProcessSTFT & operator --() {
+			Super::operator --();
+			--input;
+			return *this;
+		}
+		ProcessSTFT & operator -=(int i) {
+			Super::operator -=(i);
+			input -= i;
+			return *this;
+		}
+		// @}
+	};
+
+/** @} */
+}} // signalsmith::spectral::
+#endif // include guard
diff --git a/dsp/windows.h b/dsp/windows.h
new file mode 100644
index 0000000..20f35a5
--- /dev/null
+++ b/dsp/windows.h
@@ -0,0 +1,185 @@
+#ifndef SIGNALSMITH_DSP_WINDOWS_H
+#define SIGNALSMITH_DSP_WINDOWS_H
+
+#include "./common.h"
+
+#include <cmath>
+
+namespace signalsmith {
+namespace windows {
+	/**	@defgroup Windows Window functions
+		@brief Windows for spectral analysis
+		
+		These are generally double-precision, because they are mostly calculated during setup/reconfiguring, not real-time code.
+		
+		@{
+		@file
+	*/
+	
+	/** @brief The Kaiser window (almost) maximises the energy in the main-lobe compared to the side-lobes.
+		
+		Kaiser windows can be constructing using the shape-parameter (beta) or using the static `with???()` methods.*/
+	class Kaiser {
+		// I_0(x)=\sum_{k=0}^{N}\frac{x^{2k}}{(k!)^2\cdot4^k}
+		inline static double bessel0(double x) {
+			const double significanceLimit = 1e-4;
+			double result = 0;
+			double term = 1;
+			double m = 0;
+			while (term > significanceLimit) {
+				result += term;
+				++m;
+				term *= (x*x)/(4*m*m);
+			}
+
+			return result;
+		}
+		double beta;
+		double invB0;
+		
+		static double heuristicBandwidth(double bandwidth) {
+			// Good peaks
+			//return bandwidth + 8/((bandwidth + 3)*(bandwidth + 3));
+			// Good average
+			//return bandwidth + 14/((bandwidth + 2.5)*(bandwidth + 2.5));
+			// Compromise
+			return bandwidth + 8/((bandwidth + 3)*(bandwidth + 3)) + 0.25*std::max(3 - bandwidth, 0.0);
+		}
+	public:
+		/// Set up a Kaiser window with a given shape.  `beta` is `pi*alpha` (since there is ambiguity about shape parameters)
+		Kaiser(double beta) : beta(beta), invB0(1/bessel0(beta)) {}
+
+		/// @name Bandwidth methods
+		/// @{
+		static Kaiser withBandwidth(double bandwidth, bool heuristicOptimal=false) {
+			return Kaiser(bandwidthToBeta(bandwidth, heuristicOptimal));
+		}
+
+		/** Returns the Kaiser shape where the main lobe has the specified bandwidth (as a factor of 1/window-length).
+		\diagram{kaiser-windows.svg,You can see that the main lobe matches the specified bandwidth.}
+		If `heuristicOptimal` is enabled, the main lobe width is _slightly_ wider, improving both the peak and total energy - see `bandwidthToEnergyDb()` and `bandwidthToPeakDb()`.
+		\diagram{kaiser-windows-heuristic.svg, The main lobe extends to ±bandwidth/2.} */
+		static double bandwidthToBeta(double bandwidth, bool heuristicOptimal=false) {
+			if (heuristicOptimal) { // Heuristic based on numerical search
+				bandwidth = heuristicBandwidth(bandwidth);
+			}
+			bandwidth = std::max(bandwidth, 2.0);
+			double alpha = std::sqrt(bandwidth*bandwidth*0.25 - 1);
+			return alpha*M_PI;
+		}
+		
+		static double betaToBandwidth(double beta) {
+			double alpha = beta*(1.0/M_PI);
+			return 2*std::sqrt(alpha*alpha + 1);
+		}
+		/// @}
+
+		/// @name Performance methods
+		/// @{
+		/** @brief Total energy ratio (in dB) between side-lobes and the main lobe.
+			\diagram{kaiser-bandwidth-sidelobes-energy.svg,Measured main/side lobe energy ratio.  You can see that the heuristic improves performance for all bandwidth values.}
+			This function uses an approximation which is accurate to ±0.5dB for 2 ⩽ bandwidth ≤ 10, or 1 ⩽ bandwidth ≤ 10 when `heuristicOptimal`is enabled.
+		*/
+		static double bandwidthToEnergyDb(double bandwidth, bool heuristicOptimal=false) {
+			// Horrible heuristic fits
+			if (heuristicOptimal) {
+				if (bandwidth < 3) bandwidth += (3 - bandwidth)*0.5;
+				return 12.9 + -3/(bandwidth + 0.4) - 13.4*bandwidth + (bandwidth < 3)*-9.6*(bandwidth - 3);
+			}
+			return 10.5 + 15/(bandwidth + 0.4) - 13.25*bandwidth + (bandwidth < 2)*13*(bandwidth - 2);
+		}
+		static double energyDbToBandwidth(double energyDb, bool heuristicOptimal=false) {
+			double bw = 1;
+			while (bw < 20 && bandwidthToEnergyDb(bw, heuristicOptimal) > energyDb) {
+				bw *= 2;
+			}
+			double step = bw/2;
+			while (step > 0.0001) {
+				if (bandwidthToEnergyDb(bw, heuristicOptimal) > energyDb) {
+					bw += step;
+				} else {
+					bw -= step;
+				}
+				step *= 0.5;
+			}
+			return bw;
+		}
+		/** @brief Peak ratio (in dB) between side-lobes and the main lobe.
+			\diagram{kaiser-bandwidth-sidelobes-peak.svg,Measured main/side lobe peak ratio.  You can see that the heuristic improves performance, except in the bandwidth range 1-2 where peak ratio was sacrificed to improve total energy ratio.}
+			This function uses an approximation which is accurate to ±0.5dB for 2 ⩽ bandwidth ≤ 9, or 0.5 ⩽ bandwidth ≤ 9 when `heuristicOptimal`is enabled.
+		*/
+		static double bandwidthToPeakDb(double bandwidth, bool heuristicOptimal=false) {
+			// Horrible heuristic fits
+			if (heuristicOptimal) {
+				return 14.2 - 20/(bandwidth + 1) - 13*bandwidth + (bandwidth < 3)*-6*(bandwidth - 3) + (bandwidth < 2.25)*5.8*(bandwidth - 2.25);
+			}
+			return 10 + 8/(bandwidth + 2) - 12.75*bandwidth + (bandwidth < 2)*4*(bandwidth - 2);
+		}
+		static double peakDbToBandwidth(double peakDb, bool heuristicOptimal=false) {
+			double bw = 1;
+			while (bw < 20 && bandwidthToPeakDb(bw, heuristicOptimal) > peakDb) {
+				bw *= 2;
+			}
+			double step = bw/2;
+			while (step > 0.0001) {
+				if (bandwidthToPeakDb(bw, heuristicOptimal) > peakDb) {
+					bw += step;
+				} else {
+					bw -= step;
+				}
+				step *= 0.5;
+			}
+			return bw;
+		}
+		/** @} */
+
+		/** Equivalent noise bandwidth (ENBW), a measure of frequency resolution.
+			\diagram{kaiser-bandwidth-enbw.svg,Measured ENBW, with and without the heuristic bandwidth adjustment.}
+			This approximation is accurate to ±0.05 up to a bandwidth of 22.
+		*/
+		static double bandwidthToEnbw(double bandwidth, bool heuristicOptimal=false) {
+			if (heuristicOptimal) bandwidth = heuristicBandwidth(bandwidth);
+			double b2 = std::max<double>(bandwidth - 2, 0);
+			return 1 + b2*(0.2 + b2*(-0.005 + b2*(-0.000005 + b2*0.0000022)));
+		}
+
+		/// Return the window's value for position in the range [0, 1]
+		double operator ()(double unit) {
+			double r = 2*unit - 1;
+			double arg = std::sqrt(1 - r*r);
+			return bessel0(beta*arg)*invB0;
+		}
+	
+		/// Fills an arbitrary container with a Kaiser window
+		template<typename Data>
+		void fill(Data &data, int size) const {
+			double invSize = 1.0/size;
+			for (int i = 0; i < size; ++i) {
+				double r = (2*i + 1)*invSize - 1;
+				double arg = std::sqrt(1 - r*r);
+				data[i] = bessel0(beta*arg)*invB0;
+			}
+		}
+	};
+
+	/** Forces STFT perfect-reconstruction (WOLA) on an existing window, for a given STFT interval.
+	For example, here are perfect-reconstruction versions of the approximately-optimal @ref Kaiser windows:
+	\diagram{kaiser-windows-heuristic-pr.svg,Note the lower overall energy\, and the pointy top for 2x bandwidth. Spectral performance is about the same\, though.}
+	*/
+	template<typename Data>
+	void forcePerfectReconstruction(Data &data, int windowLength, int interval) {
+		for (int i = 0; i < interval; ++i) {
+			double sum2 = 0;
+			for (int index = i; index < windowLength; index += interval) {
+				sum2 += data[index]*data[index];
+			}
+			double factor = 1/std::sqrt(sum2);
+			for (int index = i; index < windowLength; index += interval) {
+				data[index] *= factor;
+			}
+		}
+	}
+
+/** @} */
+}} // signalsmith::windows
+#endif // include guard
diff --git a/signalsmith-stretch.h b/signalsmith-stretch.h
new file mode 100644
index 0000000..6f4b3f1
--- /dev/null
+++ b/signalsmith-stretch.h
@@ -0,0 +1,412 @@
+#ifndef SIGNALSMITH_STRETCH_H
+#define SIGNALSMITH_STRETCH_H
+
+#include "dsp/spectral.h"
+#include "dsp/delay.h"
+#include "dsp/curves.h"
+#include <vector>
+#include <functional>
+#include <algorithm>
+
+namespace signalsmith { namespace stretch {
+
+template<typename Sample=float>
+struct SignalsmithStretch {
+
+	int blockSamples() const {
+		return stft.windowSize();
+	}
+	int intervalSamples() const {
+		return stft.interval();
+	}
+	int inputLatencySamples() const {
+		return stft.windowSize()/2;
+	}
+	int outputLatencySamples() const {
+		return stft.windowSize() - inputLatencySamples();
+	}
+	
+	void reset() {
+		stft.reset();
+		inputBuffer.reset();
+		prevInputOffset = -1;
+		channelBands.assign(channelBands.size(), Band());
+	}
+
+	/// Configures using a default preset
+	void presetDefault(int nChannels, Sample sampleRate) {
+		configure(nChannels, sampleRate*0.12, sampleRate*0.03);
+		freqWeight = 1;
+		timeWeight = 2;
+		channelWeight = 0.5;
+	}
+
+	/// Manual setup
+	Sample freqWeight = 1, timeWeight = 2, channelWeight = 0.5;
+	void configure(int nChannels, int blockSamples, int intervalSamples) {
+		channels = nChannels;
+		stft.resize(channels, blockSamples, intervalSamples);
+		inputBuffer.resize(channels, blockSamples);
+		timeBuffer.assign(stft.fftSize(), 0);
+		channelBands.assign(stft.bands()*channels, Band());
+		
+		// Various phase rotations
+		rotCentreSpectrum.resize(stft.bands());
+		rotPrevInput.assign(stft.bands(), 0);
+		rotPrevOutput.resize(stft.bands());
+		timeShiftPhases(blockSamples*Sample(-0.5), rotCentreSpectrum);
+		timeShiftPhases(-intervalSamples, rotPrevOutput);
+		peaks.reserve(stft.bands());
+		smoothedEnergy.resize(stft.bands());
+		inputBinMap.resize(stft.bands());
+		outputGainMap.resize(stft.bands());
+	}
+	
+	template<class Inputs, class Outputs>
+	void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
+		Sample timeScaling = Sample(inputSamples)/outputSamples;
+
+		for (int outputIndex = 0; outputIndex < outputSamples; ++outputIndex) {
+			stft.ensureValid(outputIndex, [&](int outputOffset) {
+				// Time to process a spectrum!  Where should it come from in the input?
+				int inputOffset = (outputOffset*inputSamples)/outputSamples - stft.windowSize();
+				int inputInterval = inputOffset - prevInputOffset;
+				prevInputOffset = inputOffset;
+
+				if (inputInterval > 0) {
+					timeShiftPhases(-inputInterval, rotPrevInput);
+					for (int c = 0; c < channels; ++c) {
+						// Copy from the history buffer, if needed
+						auto &&bufferChannel = inputBuffer[c];
+						for (int i = 0; i < -inputOffset; ++i) {
+							timeBuffer[i] = bufferChannel[i + inputOffset];
+						}
+						// Copy the rest from the input
+						auto &&inputChannel = inputs[c];
+						for (int i = std::max<int>(0, -inputOffset); i < stft.windowSize(); ++i) {
+							timeBuffer[i] = inputChannel[i + inputOffset];
+						}
+		
+						stft.analyse(c, timeBuffer);
+					}
+				}
+				
+				for (int c = 0; c < channels; ++c) {
+					auto bands = bandsForChannel(c);
+					auto &&spectrumBands = stft.spectrum[c];
+					for (int b = 0; b < stft.bands(); ++b) {
+						bands[b].input = spectrumBands[b]*rotCentreSpectrum[b];
+					}
+				}
+				
+				processSpectrum(inputInterval);
+
+				for (int c = 0; c < channels; ++c) {
+					auto bands = bandsForChannel(c);
+					auto &&spectrumBands = stft.spectrum[c];
+					for (int b = 0; b < stft.bands(); ++b) {
+						spectrumBands[b] = bands[b].output*std::conj(rotCentreSpectrum[b]);
+					}
+				}
+			});
+
+			for (int c = 0; c < channels; ++c) {
+				auto &&outputChannel = outputs[c];
+				auto &&stftChannel = stft[c];
+				outputChannel[outputIndex] = stftChannel[outputIndex];
+			}
+		}
+
+		// Store input in history buffer
+		for (int c = 0; c < channels; ++c) {
+			auto &&inputChannel = inputs[c];
+			auto &&bufferChannel = inputBuffer[c];
+			int startIndex = std::max<int>(0, inputSamples - stft.windowSize());
+			for (int i = startIndex; i < inputSamples; ++i) {
+				bufferChannel[i] = inputChannel[i];
+			}
+		}
+		inputBuffer += inputSamples;
+		stft += outputSamples;
+		prevInputOffset -= inputSamples;
+	}
+	
+	/// Frequency multiplier, and optional tonality limit (as multiple of sample-rate)
+	void setTransposeFactor(Sample multiplier, Sample tonalityLimit=0) {
+		freqMultiplier = multiplier;
+		if (tonalityLimit > 0) {
+			freqTonalityLimit = tonalityLimit/std::sqrt(multiplier); // compromise between input and output limits
+		} else {
+			freqTonalityLimit = 1;
+		}
+	}
+	void setTransposeSemitones(Sample semitones, Sample tonalityLimit=0) {
+		setTransposeFactor(std::pow(2, semitones/12), tonalityLimit);
+	}
+
+	struct Peak {
+		Sample input, output, energy;
+		
+		bool operator< (const Peak &other) const {
+			return output < other.output;
+		}
+	};
+	std::vector<Peak> peaks;
+	/// This function is called once per channel, from inside `.process()`, so that you can alter the mapping in `.peaks`
+	void setMap(std::function<void(int)> freqMap) {
+		frequencyMapFn = freqMap;
+	}
+	
+private:
+	using Complex = std::complex<Sample>;
+	Sample freqMultiplier = 1, freqTonalityLimit = 0.5;
+	std::function<void(int)> frequencyMapFn;
+
+	signalsmith::spectral::STFT<Sample> stft{0, 1, 1};
+	signalsmith::delay::MultiBuffer<Sample> inputBuffer;
+	int channels = 0;
+	int prevInputOffset = -1;
+	std::vector<Sample> timeBuffer;
+
+	std::vector<Complex> rotCentreSpectrum, rotPrevOutput, rotPrevInput;
+	Sample bandToFreq(int b) const {
+		return (b + Sample(0.5))/stft.fftSize();
+	}
+	void timeShiftPhases(Sample shiftSamples, std::vector<Complex> &output) const {
+		for (int b = 0; b < stft.bands(); ++b) {
+			Sample phase = bandToFreq(b)*shiftSamples*Sample(-2*M_PI);
+			output[b] = {std::cos(phase), std::sin(phase)};
+		}
+	}
+	
+	struct Band {
+		Complex input, prevInput{0};
+		Complex output, prevOutput{0};
+		Complex timeChange{0};
+		Sample energy, prevEnergy;
+	};
+	std::vector<Band> channelBands;
+	Band * bandsForChannel(int channel) {
+		return channelBands.data() + channel*stft.bands();
+	}
+	template<Complex Band::*member>
+	Complex getBand(int channel, int index) {
+		if (index >= stft.bands()) return 0;
+		if (index < 0) {
+			return std::conj(getBand<member>(channel, -1 - index));
+		}
+		return channelBands[index + channel*stft.bands()].*member;
+	}
+	template<Complex Band::*member>
+	Complex getFractional(int channel, int lowIndex, Sample fractional) {
+		Complex low = getBand<member>(channel, lowIndex);
+		Complex high = getBand<member>(channel, lowIndex + 1);
+		return low + (high - low)*fractional;
+	}
+	template<Sample Band::*member>
+	Sample getBand(int channel, int index) {
+		if (index < 0) index = -1 - index;
+		if (index >= stft.bands()) return 0;
+		return channelBands[index + channel*stft.bands()].*member;
+	}
+	template<Sample Band::*member>
+	Sample getFractional(int channel, int lowIndex, Sample fractional) {
+		Sample low = getBand<member>(channel, lowIndex);
+		Sample high = getBand<member>(channel, lowIndex + 1);
+		return low + (high - low)*fractional;
+	}
+
+	Sample peakThreshold = 1;
+	std::vector<Sample> smoothedEnergy, inputBinMap, outputGainMap;
+
+	void processSpectrum(int inputInterval) {
+		int outputInterval = stft.interval();
+		int bands = stft.bands();
+		
+		Sample rate = Sample(inputInterval)/outputInterval;
+		
+		if (inputInterval > 0) {
+			for (int c = 0; c < channels; ++c) {
+				auto bins = bandsForChannel(c);
+				for (int b = 0; b < stft.bands(); ++b) {
+					auto &bin = bins[b];
+					bins[b].prevOutput *= rotPrevOutput[b];
+					bins[b].prevInput *= rotPrevInput[b];
+				}
+			}
+		}
+
+		Sample smoothingBins = Sample(stft.fftSize())/stft.interval();
+		Band *bins0 = bandsForChannel(0);
+		for (int c = 0; c < channels; ++c) {
+			Band *bins = bandsForChannel(c);
+
+			findPeaks(bins, smoothingBins);
+			if (frequencyMapFn) frequencyMapFn(c);
+	
+			// Scale so they map bins, not frequency
+			for (auto &p : peaks) {
+				p.input *= stft.fftSize();
+				p.output *= stft.fftSize();
+			}
+			// Create the input/output bin map
+			updateBinMap(smoothingBins);
+			
+			for (int b = 0; b < stft.bands(); ++b) {
+				Sample inputIndex = inputBinMap[b];
+				int lowIndex = std::floor(inputIndex);
+				Sample fracIndex = inputIndex - std::floor(inputIndex);
+				Sample outputEnergy = getFractional<&Band::energy>(c, lowIndex, fracIndex);
+				
+				Band &outputBin = bins[b];
+				Complex input = getFractional<&Band::input>(c, lowIndex, fracIndex);
+				Complex prevInput = getFractional<&Band::prevInput>(c, lowIndex, fracIndex);
+				Complex timeChange = input*std::conj(prevInput);
+
+				Complex prediction = outputBin.prevOutput*timeChange*freqWeight;
+
+				if (b > 0) {
+					Sample downIndex = inputIndex - rate;
+					int downLowIndex = std::floor(downIndex);
+					Sample fracDownIndex = downIndex - std::floor(downIndex);
+					Complex downInput = getFractional<&Band::input>(c, downLowIndex, fracDownIndex);
+					Complex freqChange = input*std::conj(downInput);
+					Complex outputDown = bins[b - 1].output;
+					prediction += outputDown*freqChange*timeWeight;
+				}
+				int longStep = std::round(smoothingBins);
+				if (b > longStep) {
+					Sample downIndex = inputIndex - longStep*rate;
+					int downLowIndex = std::floor(downIndex);
+					Sample fracDownIndex = downIndex - std::floor(downIndex);
+					Complex downInput = getFractional<&Band::input>(c, downLowIndex, fracDownIndex);
+					Complex freqChange = input*std::conj(downInput);
+					Complex outputDown = bins[b - longStep].output;
+					prediction += outputDown*freqChange*timeWeight;
+				}
+				
+				if (c > 0) {
+					Complex ch0Input = getFractional<&Band::input>(0, lowIndex, fracIndex);
+					Complex ch0Output = bins0[b].output;
+					Complex channelRot = input*std::conj(ch0Input);
+					prediction += ch0Output*channelRot*channelWeight;
+				}
+				
+				Sample predictionNorm = std::norm(prediction);
+				if (predictionNorm > 1e-15) {
+					outputBin.output = prediction*std::sqrt(outputEnergy/predictionNorm);
+				} else {
+					outputBin.output = input;
+				}
+				outputBin.output *= outputGainMap[b];
+			}
+		}
+
+		for (auto &bin : channelBands) {
+			bin.prevOutput = bin.output;
+			bin.prevInput = bin.input;
+			bin.prevEnergy = bin.energy;
+		}
+	}
+	
+	void smoothEnergy(Band *bins, Sample smoothingBins) {
+		Sample smoothingSlew = 1/(1 + smoothingBins*Sample(0.5));
+		for (int b = 0; b < stft.bands(); ++b) {
+			auto &bin = bins[b];
+			Sample e = std::norm(bin.input);
+			bin.energy = e;
+			smoothedEnergy[b] = e*peakThreshold;
+		}
+		Sample e = 0;
+		for (int repeat = 0; repeat < 2; ++repeat) {
+			for (int b = stft.bands() - 1; b >= 0; --b) {
+				auto &bin = bins[b];
+				e += (smoothedEnergy[b] - e)*smoothingSlew;
+				smoothedEnergy[b] = e;
+			}
+			for (int b = 0; b < stft.bands(); ++b) {
+				auto &bin = bins[b];
+				e += (smoothedEnergy[b] - e)*smoothingSlew;
+				smoothedEnergy[b] = e;
+			}
+		}
+	}
+	
+	Sample defaultFreqMap(Sample freq) const {
+		if (freq > freqTonalityLimit) {
+			Sample diff = freq - freqTonalityLimit;
+			return freqTonalityLimit*freqMultiplier + diff;
+		}
+		return freq*freqMultiplier;
+	}
+	
+	void findPeaks(Band *bins, Sample smoothingBins) {
+		smoothEnergy(bins, smoothingBins);
+
+		peaks.resize(0);
+		// Artificial peak at 0
+		peaks.emplace_back(Peak{0, 0, 0});
+		
+		int start = 0;
+		while (start < stft.bands()) {
+			if (bins[start].energy > smoothedEnergy[start]) {
+				int end = start + 1;
+				while (end < stft.bands() && bins[end].energy > smoothedEnergy[end]) {
+					++end;
+				}
+				// Take the average frequency and energy across the peak range
+				Sample freqSum = 0, energySum = 0;
+				for (int b = start; b < end; ++b) {
+					Sample e = bins[b].energy;
+					freqSum += (b + 0.5)*e;
+					energySum += e;
+				}
+				Sample avgFreq = freqSum/(stft.fftSize()*energySum);
+				Sample avgEnergy = energySum/(end - start);
+				peaks.emplace_back(Peak{avgFreq, defaultFreqMap(avgFreq), avgEnergy});
+
+				start = end;
+			}
+			++start;
+		}
+		// Artificial peak at Nyquist
+		peaks.emplace_back(Peak{0.5, defaultFreqMap(freqMultiplier), 0});
+	}
+	
+	void updateBinMap(Sample peakWidthBins) {
+		std::stable_sort(peaks.begin(), peaks.end());
+		Sample linearZoneBins = peakWidthBins*Sample(0.5);
+		for (auto &g : outputGainMap) g = 1; // reset gains
+		for (int b = 0; b < std::min<int>(stft.bands(), peaks[0].output); ++b) {
+			inputBinMap[b] = peaks[0].input;
+			outputGainMap[b] = 0;
+		}
+		for (size_t p = 1; p < peaks.size(); ++p) {
+			const Peak &prev = peaks[p - 1], &next = peaks[p];
+			Sample prevEnd = prev.output + linearZoneBins;
+			Sample nextStart = next.output - linearZoneBins;
+			signalsmith::curves::Linear<Sample> segment(prevEnd, nextStart, prev.input + linearZoneBins, next.input - linearZoneBins);
+
+			if (nextStart < prevEnd) nextStart = prevEnd = (nextStart + prevEnd)*Sample(0.5);
+			prevEnd = std::max<Sample>(0, std::min<Sample>(stft.bands(), prevEnd));
+			nextStart = std::max<Sample>(0, std::min<Sample>(stft.bands(), nextStart));
+
+			for (int b = std::max<int>(0, std::ceil(prev.output)); b < prevEnd; ++b) {
+				inputBinMap[b] = b + prev.input - prev.output;
+			}
+			for (int b = std::ceil(prevEnd); b < nextStart; ++b) {
+				inputBinMap[b] = segment(b);
+			}
+			for (int b = std::ceil(nextStart); b < std::min<int>(stft.bands(), std::ceil(next.output)); ++b) {
+				inputBinMap[b] = b + next.input - next.output;
+			}
+		}
+		for (int b = std::max<int>(0, peaks.back().output); b < stft.bands(); ++b) {
+			inputBinMap[b] = peaks.back().input;
+			outputGainMap[b] = 0;
+		}
+	}
+};
+
+}} // namespace
+#endif // include guard