diff --git a/dsp/README.md b/dsp/README.md
index d0c2840..8039b3e 100644
--- a/dsp/README.md
+++ b/dsp/README.md
@@ -22,7 +22,7 @@ Delay delayLine(1024);
 You can add a compile-time version-check to make sure you have a compatible version of the library:
 ```cpp
 #include "dsp/envelopes.h"
-SIGNALSMITH_DSP_VERSION_CHECK(1, 3, 3)
+SIGNALSMITH_DSP_VERSION_CHECK(1, 4, 4)
 ```
 
 ### Development / contributing
diff --git a/dsp/common.h b/dsp/common.h
index a1963e6..68af091 100644
--- a/dsp/common.h
+++ b/dsp/common.h
@@ -14,13 +14,13 @@ namespace signalsmith {
 	*/
 
 #define SIGNALSMITH_DSP_VERSION_MAJOR 1
-#define SIGNALSMITH_DSP_VERSION_MINOR 3
-#define SIGNALSMITH_DSP_VERSION_PATCH 3
-#define SIGNALSMITH_DSP_VERSION_STRING "1.3.3"
+#define SIGNALSMITH_DSP_VERSION_MINOR 4
+#define SIGNALSMITH_DSP_VERSION_PATCH 4
+#define SIGNALSMITH_DSP_VERSION_STRING "1.4.4"
 
 	/** Version compatability check.
 	\code{.cpp}
-		static_assert(signalsmith::version(1, 0, 0), "version check");
+		static_assert(signalsmith::version(1, 4, 1), "version check");
 	\endcode
 	... or use the equivalent `SIGNALSMITH_DSP_VERSION_CHECK`.
 	Major versions are not compatible with each other.  Minor and patch versions are backwards-compatible.
@@ -37,4 +37,7 @@ namespace signalsmith {
 
 /** @} */
 } // signalsmith::
+#else
+// If we've already included it, check it's the same version
+static_assert(SIGNALSMITH_DSP_VERSION_MAJOR == 1 && SIGNALSMITH_DSP_VERSION_MINOR == 4 && SIGNALSMITH_DSP_VERSION_PATCH == 4, "multiple versions of the Signalsmith DSP library");
 #endif // include guard
diff --git a/dsp/curves.h b/dsp/curves.h
index 934dd29..dad132a 100644
--- a/dsp/curves.h
+++ b/dsp/curves.h
@@ -1,8 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_CURVES_H
 #define SIGNALSMITH_DSP_CURVES_H
 
-#include "./common.h"
-
 #include <vector>
 #include <algorithm> // std::stable_sort
 
@@ -29,6 +29,10 @@ namespace curves {
 			return a0 + x*a1;
 		}
 		
+		Sample dx() const {
+			return a1;
+		}
+		
 		/// Returns the inverse map (with some numerical error)
 		Linear inverse() const {
 			Sample invA1 = 1/a1;
@@ -178,16 +182,33 @@ namespace curves {
 		}
 		
 		/// Reads a value out from the curve.
-		Sample operator ()(Sample x) const {
+		Sample operator()(Sample x) const {
 			if (x <= first.x) return first.y;
 			if (x >= last.x) return last.y;
-			size_t index = 1;
-			while (index < _segments.size() && _segments[index].start() <= x) {
-				++index;
+			
+			// Binary search
+			size_t low = 0, high = _segments.size();
+			while (true) {
+				size_t mid = (low + high)/2;
+				if (low == mid) break;
+				if (_segments[mid].start() <= x) {
+					low = mid;
+				} else {
+					high = mid;
+				}
 			}
-			return _segments[index - 1](x);
+			return _segments[low](x);
 		}
 		
+		CubicSegmentCurve dx() const {
+			CubicSegmentCurve result{*this};
+			result.first.y = result.last.y = 0;
+			for (auto &s : result._segments) {
+				s = s.dx();
+			}
+			return result;
+		}
+
 		using Segment = Cubic<Sample>;
 		const std::vector<Segment> & segments() const {
 			return _segments;
diff --git a/dsp/delay.h b/dsp/delay.h
index f52113f..04c5631 100644
--- a/dsp/delay.h
+++ b/dsp/delay.h
@@ -1,8 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_DELAY_H
 #define SIGNALSMITH_DSP_DELAY_H
 
-#include "./common.h"
-
 #include <vector>
 #include <array>
 #include <cmath> // for std::ceil()
@@ -484,7 +484,8 @@ namespace delay {
 			subSampleSteps = 2*n; // Heuristic again.  Really it depends on the bandwidth as well.
 			double kaiserBandwidth = (stopFreq - passFreq)*(n + 1.0/subSampleSteps);
 			kaiserBandwidth += 1.25/kaiserBandwidth; // We want to place the first zero, but (because using this to window a sinc essentially integrates it in the freq-domain), our ripples (and therefore zeroes) are out of phase.  This is a heuristic fix.
-			
+			double sincScale = M_PI*(passFreq + stopFreq);
+
 			double centreIndex = n*subSampleSteps*0.5, scaleFactor = 1.0/subSampleSteps;
 			std::vector<Sample> windowedSinc(subSampleSteps*n + 1);
 			
@@ -497,7 +498,7 @@ namespace delay {
 					// Exact 0s
 					windowedSinc[i] = 0;
 				} else if (std::abs(x) > 1e-6) {
-					double p = x*M_PI;
+					double p = x*sincScale;
 					windowedSinc[i] *= std::sin(p)/p;
 				}
 			}
diff --git a/dsp/envelopes.h b/dsp/envelopes.h
index a833c51..2bc4831 100644
--- a/dsp/envelopes.h
+++ b/dsp/envelopes.h
@@ -1,8 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_ENVELOPES_H
 #define SIGNALSMITH_DSP_ENVELOPES_H
 
-#include "./common.h"
-
 #include <cmath>
 #include <random>
 #include <vector>
@@ -518,6 +518,77 @@ namespace envelopes {
 		}
 	};
 
+	/** Rolling Statistics
+		\diagram{envelopes-order-statistics}
+		
+		This keeps a history of recent samples
+	*/
+	template<typename Sample>
+	class RollingStats {
+		size_t index = 0;
+		std::vector<Sample> buffer, sorted;
+		
+		struct SkipListEntry {
+			Sample value;
+			int distance;
+			int nextEntry;
+			int finerEntry;
+			
+			int findBefore(const Sample &v, int thisIndex) const {
+				if (nextEntry == -1) return thisIndex;
+				SkipListEntry &next = skipList[nextEntry];
+				if (
+			}
+		};
+		std::vector<SkipListEntry> skipList;
+		const int findBefore(const Sample &v) const {
+			if (skipList.empty() || skipList[0].value >= v) return -1;
+
+			return skipList[0].findBefore(v, 0);
+		}
+		void addEntry(const Sample &v) {
+			if (skipList.empty()) {
+				skipList.push_back({v, 1, -1, -1});
+				return;
+			}
+			SkipListEntry *entry = &skipList[0];
+			while (entry) {
+				if (
+			}
+		}
+		
+	public:
+		RollingStats(int length, Sample initial=Sample()) {
+			buffer.assign(length, initial);
+			sorted.assign(length, initial);
+
+			for (int i = 0; i < length; ++i) {
+				addEntry(initial);
+			}
+		}
+		
+		void reset(Sample initial=Sample()) {
+			buffer.assign(buffer.size(), initial);
+			sorted.assign(buffer.size(), initial);
+			index = 0;
+		}
+		
+		void operator()(const Sample &v) {
+			buffer[index] = v;
+			sorted = buffer;
+			std::sort(sorted.begin(), sorted.end());
+			if (++index >= buffer.size()) index = 0;
+		}
+		
+		const Sample & operator[](int index) const {
+			return sorted[index];
+		}
+		
+		Sample percentile(double percentile) {
+			return sorted[(sorted.size() - 1e-10)*percentile];
+		}
+	};
+	
 /** @} */
 }} // signalsmith::envelopes::
 #endif // include guard
diff --git a/dsp/fft.h b/dsp/fft.h
index fcaf4f0..74c085a 100644
--- a/dsp/fft.h
+++ b/dsp/fft.h
@@ -1,7 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_FFT_V5
 #define SIGNALSMITH_FFT_V5
 
-#include "./common.h"
 #include "./perf.h"
 
 #include <vector>
diff --git a/dsp/filters.h b/dsp/filters.h
index 9545062..6e6a3c6 100644
--- a/dsp/filters.h
+++ b/dsp/filters.h
@@ -1,7 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_FILTERS_H
 #define SIGNALSMITH_DSP_FILTERS_H
 
-#include "./common.h"
 #include "./perf.h"
 
 #include <cmath>
@@ -54,11 +55,10 @@ namespace filters {
 			double w0, sinW0, cosW0;
 			double inv2Q;
 			
-			FreqSpec(double scaledFreq, BiquadDesign design) {
-				this->scaledFreq = scaledFreq = std::max(1e-6, std::min(0.4999, scaledFreq));
+			FreqSpec(double freq, BiquadDesign design) {
+				scaledFreq = std::max(1e-6, std::min(0.4999, freq));
 				if (design == BiquadDesign::cookbook) {
-					// Falls apart a bit near Nyquist
-					this->scaledFreq = scaledFreq = std::min(0.45, scaledFreq);
+					scaledFreq = std::min(0.45, scaledFreq);
 				}
 				w0 = 2*M_PI*scaledFreq;
 				cosW0 = std::cos(w0);
@@ -107,21 +107,22 @@ namespace filters {
 				double Q = (type == Type::peak ? 0.5*sqrtGain : 0.5)/calc.inv2Q;
 				double q = (type == Type::peak ? 1/sqrtGain : 1)*calc.inv2Q;
 				double expmqw = std::exp(-q*w0);
+				double da1, da2;
 				if (q <= 1) {
-					a1 = -2*expmqw*std::cos(std::sqrt(1 - q*q)*w0);
+					a1 = da1 = -2*expmqw*std::cos(std::sqrt(1 - q*q)*w0);
 				} else {
-					a1 = -2*expmqw*std::cosh(std::sqrt(q*q - 1)*w0);
+					a1 = da1 = -2*expmqw*std::cosh(std::sqrt(q*q - 1)*w0);
 				}
-				a2 = expmqw*expmqw;
+				a2 = da2 = expmqw*expmqw;
 				double sinpd2 = std::sin(w0/2);
 				double p0 = 1 - sinpd2*sinpd2, p1 = sinpd2*sinpd2, p2 = 4*p0*p1;
-				double A0 = 1 + a1 + a2, A1 = 1 - a1 + a2, A2 = -4*a2;
+				double A0 = 1 + da1 + da2, A1 = 1 - da1 + da2, A2 = -4*da2;
 				A0 *= A0;
 				A1 *= A1;
 				if (type == Type::lowpass) {
 					double R1 = (A0*p0 + A1*p1 + A2*p2)*Q*Q;
 					double B0 = A0, B1 = (R1 - B0*p0)/p1;
-					b0 = 0.5*(std::sqrt(B0) + std::sqrt(B1));
+					b0 = 0.5*(std::sqrt(B0) + std::sqrt(std::max(0.0, B1)));
 					b1 = std::sqrt(B0) - b0;
 					b2 = 0;
 					return *this;
@@ -134,17 +135,18 @@ namespace filters {
 					double R2 = -A0 + A1 + 4*(p0 - p1)*A2;
 					double B2 = (R1 - R2*p1)/(4*p1*p1);
 					double B1 = R2 + 4*(p1 - p0)*B2;
-					b1 = -0.5*std::sqrt(B1);
-					b0 = 0.5*(std::sqrt(B2 + 0.25*B1) - b1);
+					b1 = -0.5*std::sqrt(std::max(0.0, B1));
+					b0 = 0.5*(std::sqrt(std::max(0.0, B2 + 0.25*B1)) - b1);
 					b2 = -b0 - b1;
 					return *this;
 				} else if (type == Type::notch) {
 					// The Vicanek paper doesn't cover notches (band-stop), but we know where the zeros should be:
 					b0 = 1;
-					b1 = -2*std::cos(w0);
+					double db1 = -2*std::cos(w0); // might be higher precision
+					b1 = db1;
 					b2 = 1;
 					// Scale so that B0 == A0 to get 0dB at f=0
-					double scale = std::sqrt(A0)/(b0 + b1 + b2);
+					double scale = std::sqrt(A0)/(b0 + db1 + b2);
 					b0 *= scale;
 					b1 *= scale;
 					b2 *= scale;
@@ -156,9 +158,9 @@ namespace filters {
 					double B0 = A0;
 					double B2 = (R1 - R2*p1 - B0)/(4*p1*p1);
 					double B1 = R2 + B0 + 4*(p1 - p0)*B2;
-					double W = 0.5*(std::sqrt(B0) + std::sqrt(B1));
-					b0 = 0.5*(W + std::sqrt(W*W + B2));
-					b1 = 0.5*(std::sqrt(B0) - std::sqrt(B1));
+					double W = 0.5*(std::sqrt(B0) + std::sqrt(std::max(0.0, B1)));
+					b0 = 0.5*(W + std::sqrt(std::max(0.0, W*W + B2)));
+					b1 = 0.5*(std::sqrt(B0) - std::sqrt(std::max(0.0, B1)));
 					b2 = -B2/(4*b0);
 					return *this;
 				}
diff --git a/dsp/mix.h b/dsp/mix.h
index 5b85a3c..c4b25b1 100644
--- a/dsp/mix.h
+++ b/dsp/mix.h
@@ -1,8 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_MULTI_CHANNEL_H
 #define SIGNALSMITH_DSP_MULTI_CHANNEL_H
 
-#include "./common.h"
-
 #include <array>
 
 namespace signalsmith {
diff --git a/dsp/perf.h b/dsp/perf.h
index ab72821..779c7b7 100644
--- a/dsp/perf.h
+++ b/dsp/perf.h
@@ -1,8 +1,16 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_PERF_H
 #define SIGNALSMITH_DSP_PERF_H
 
 #include <complex>
 
+#if defined(__SSE__) || defined(_M_X64)
+#	include <xmmintrin.h>
+#else
+#	include <cstdint> // for uintptr_t
+#endif
+
 namespace signalsmith {
 namespace perf {
 	/**	@defgroup Performance Performance helpers
@@ -24,7 +32,7 @@ namespace perf {
 	#endif
 
 	/** @brief Complex-multiplication (with optional conjugate second-arg), without handling NaN/Infinity
-		The `std::complex` multiplication has edge-cases around NaNs which slow things down and prevent auto-vectorisation.
+		The `std::complex` multiplication has edge-cases around NaNs which slow things down and prevent auto-vectorisation.  Flags like `-ffast-math` sort this out anyway, but this helps with Debug builds.
 	*/
 	template <bool conjugateSecond=false, typename V>
 	SIGNALSMITH_INLINE static std::complex<V> mul(const std::complex<V> &a, const std::complex<V> &b) {
@@ -37,6 +45,39 @@ namespace perf {
 		};
 	}
 
+#if defined(__SSE__) || defined(_M_X64)
+	class StopDenormals {
+		unsigned int controlStatusRegister;
+	public:
+		StopDenormals() : controlStatusRegister(_mm_getcsr()) {
+			_mm_setcsr(controlStatusRegister|0x8040); // Flush-to-Zero and Denormals-Are-Zero
+		}
+		~StopDenormals() {
+			_mm_setcsr(controlStatusRegister);
+		}
+	};
+#elif (defined (__ARM_NEON) || defined (__ARM_NEON__))
+	class StopDenormals {
+		uintptr_t status;
+	public:
+		StopDenormals() {
+			uintptr_t asmStatus;
+			asm volatile("mrs %0, fpcr" : "=r"(asmStatus));
+			status = asmStatus = asmStatus|0x01000000U; // Flush to Zero
+			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
+		}
+		~StopDenormals() {
+			uintptr_t asmStatus = status;
+			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
+		}
+	};
+#else
+#	if __cplusplus >= 202302L
+# 		warning "The `StopDenormals` class doesn't do anything for this architecture"
+#	endif
+	class StopDenormals {}; // FIXME: add for other architectures
+#endif
+
 /** @} */
 }} // signalsmith::perf::
 
diff --git a/dsp/rates.h b/dsp/rates.h
new file mode 100644
index 0000000..b5da8c7
--- /dev/null
+++ b/dsp/rates.h
@@ -0,0 +1,184 @@
+#include "./common.h"
+
+#ifndef SIGNALSMITH_DSP_RATES_H
+#define SIGNALSMITH_DSP_RATES_H
+
+#include "./windows.h"
+#include "./delay.h"
+
+namespace signalsmith {
+namespace rates {
+	/**	@defgroup Rates Multi-rate processing
+		@brief Classes for oversampling/upsampling/downsampling etc.
+
+		@{
+		@file
+	*/
+	
+	/// @brief Fills a container with a Kaiser-windowed sinc for an FIR lowpass.
+	/// \diagram{rates-kaiser-sinc.svg,33-point results for various pass/stop frequencies}
+	template<class Data>
+	void fillKaiserSinc(Data &&data, int length, double passFreq, double stopFreq) {
+		if (length <= 0) return;
+		double kaiserBandwidth = (stopFreq - passFreq)*length;
+		kaiserBandwidth += 1.25/kaiserBandwidth; // heuristic for transition band, see `InterpolatorKaiserSincN`
+		auto kaiser = signalsmith::windows::Kaiser::withBandwidth(kaiserBandwidth);
+		kaiser.fill(data, length);
+
+		double centreIndex = (length - 1)*0.5;
+		double sincScale = M_PI*(passFreq + stopFreq);
+		double ampScale = (passFreq + stopFreq);
+		for (int i = 0; i < length; ++i) {
+			double x = (i - centreIndex), px = x*sincScale;
+			double sinc = (std::abs(px) > 1e-6) ? std::sin(px)*ampScale/px : ampScale;
+			data[i] *= sinc;
+		}
+	};
+	/// @brief If only the centre frequency is specified, a heuristic is used to balance ripples and transition width
+	/// \diagram{rates-kaiser-sinc-heuristic.svg,The transition width is set to: 0.9/sqrt(length)}
+	template<class Data>
+	void fillKaiserSinc(Data &&data, int length, double centreFreq) {
+		double halfWidth = 0.45/std::sqrt(length);
+		if (halfWidth > centreFreq) halfWidth = (halfWidth + centreFreq)*0.5;
+		fillKaiserSinc(data, length, centreFreq - halfWidth, centreFreq + halfWidth);
+	}
+
+	/** 2x FIR oversampling for block-based processing.
+ 
+		\diagram{rates-oversampler2xfir-responses-45.svg,Upsample response for various lengths}
+	
+		The oversampled signal is stored inside this object, with channels accessed via `oversampler[c]`.  For example, you might do:
+		\code{.cpp}
+			// Upsample from multi-channel input (inputBuffers[c][i] is a sample)
+			oversampler.up(inputBuffers, bufferLength)
+			
+			// Modify the contents at the higher rate
+			for (int c = 0; c < 2; ++c) {
+				float *channel = oversampler[c];
+				for (int i = 0; i < bufferLength*2; ++i) {
+					channel[i] = std::abs(channel[i]);
+				}
+			}
+			
+			// Downsample into the multi-channel output
+			oversampler.down(outputBuffers, bufferLength);
+		\endcode
+
+		The performance depends not just on the length, but also on where you end the passband, allowing a wider/narrower transition band.  Frequencies above this limit (relative to the lower sample-rate) may alias when upsampling and downsampling.
+
+		\diagram{rates-oversampler2xfir-lengths.svg,Resample error rates for different passband thresholds}
+	
+		Since both upsample and downsample are stateful, channels are meaningful.  If your input channel-count doesn't match your output, you can size it to the larger of the two, and use `.upChannel()` and `.downChannel()` to only process the channels which exist.*/
+	template<typename Sample>
+	struct Oversampler2xFIR {
+		Oversampler2xFIR() : Oversampler2xFIR(0, 0) {}
+		Oversampler2xFIR(int channels, int maxBlock, int halfLatency=16, double passFreq=0.43) {
+			resize(channels, maxBlock, halfLatency, passFreq);
+		}
+		
+		void resize(int nChannels, int maxBlockLength) {
+			resize(nChannels, maxBlockLength, oneWayLatency);
+		}
+		void resize(int nChannels, int maxBlockLength, int halfLatency, double passFreq=0.43) {
+			oneWayLatency = halfLatency;
+			kernelLength = oneWayLatency*2;
+			channels = nChannels;
+			halfSampleKernel.resize(kernelLength);
+			fillKaiserSinc(halfSampleKernel, kernelLength, passFreq, 1 - passFreq);
+			inputStride = kernelLength + maxBlockLength;
+			inputBuffer.resize(channels*inputStride);
+			stride = (maxBlockLength + kernelLength)*2;
+			buffer.resize(stride*channels);
+		}
+
+		void reset() {
+			inputBuffer.assign(inputBuffer.size(), 0);
+			buffer.assign(buffer.size(), 0);
+		}
+
+		/// @brief Round-trip latency (or equivalently: upsample latency at the higher rate).
+		/// This will be twice the value passed into the constructor or `.resize()`.
+		int latency() const {
+			return kernelLength;
+		}
+		
+		/// Upsamples from a multi-channel input into the internal buffer
+		template<class Data>
+		void up(Data &&data, int lowSamples) {
+			for (int c = 0; c < channels; ++c) {
+				upChannel(c, data[c], lowSamples);
+			}
+		}
+
+		/// Upsamples a single-channel input into the internal buffer
+		template<class Data>
+		void upChannel(int c, Data &&data, int lowSamples) {
+			Sample *inputChannel = inputBuffer.data() + c*inputStride;
+			for (int i = 0; i < lowSamples; ++i) {
+				inputChannel[kernelLength + i] = data[i];
+			}
+			Sample *output = (*this)[c];
+			for (int i = 0; i < lowSamples; ++i) {
+				output[2*i] = inputChannel[i + oneWayLatency];
+				Sample *offsetInput = inputChannel + (i + 1);
+				Sample sum = 0;
+				for (int o = 0; o < kernelLength; ++o) {
+					sum += offsetInput[o]*halfSampleKernel[o];
+				}
+				output[2*i + 1] = sum;
+			}
+			// Copy the end of the buffer back to the beginning
+			for (int i = 0; i < kernelLength; ++i) {
+				inputChannel[i] = inputChannel[lowSamples + i];
+			}
+		}
+
+		/// Downsamples from the internal buffer to a multi-channel output
+		template<class Data>
+		void down(Data &&data, int lowSamples) {
+			for (int c = 0; c < channels; ++c) {
+				downChannel(c, data[c], lowSamples);
+			}
+		}
+
+		/// Downsamples a single channel from the internal buffer to a single-channel output
+		template<class Data>
+		void downChannel(int c, Data &&data, int lowSamples) {
+			Sample *input = buffer.data() + c*stride; // no offset for latency
+			for (int i = 0; i < lowSamples; ++i) {
+				Sample v1 = input[2*i + kernelLength];
+				Sample sum = 0;
+				for (int o = 0; o < kernelLength; ++o) {
+					Sample v2 = input[2*(i + o) + 1];
+					sum += v2*halfSampleKernel[o];
+				}
+				Sample v2 = sum;
+				Sample v = (v1 + v2)*Sample(0.5);
+				data[i] = v;
+			}
+			// Copy the end of the buffer back to the beginning
+			for (int i = 0; i < kernelLength*2; ++i) {
+				input[i] = input[lowSamples*2 + i];
+			}
+		}
+
+		/// Gets the samples for a single (higher-rate) channel.  The valid length depends how many input samples were passed into `.up()`/`.upChannel()`.
+		Sample * operator[](int c) {
+			return buffer.data() + kernelLength*2 + stride*c;
+		}
+		const Sample * operator[](int c) const {
+			return buffer.data() + kernelLength*2 + stride*c;
+		}
+
+	private:
+		int oneWayLatency, kernelLength;
+		int channels;
+		int stride, inputStride;
+		std::vector<Sample> inputBuffer;
+		std::vector<Sample> halfSampleKernel;
+		std::vector<Sample> buffer;
+	};
+
+/** @} */
+}} // namespace
+#endif // include guard
diff --git a/dsp/spectral.h b/dsp/spectral.h
index 823571d..d8b11a7 100644
--- a/dsp/spectral.h
+++ b/dsp/spectral.h
@@ -1,7 +1,8 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_SPECTRAL_H
 #define SIGNALSMITH_DSP_SPECTRAL_H
 
-#include "./common.h"
 #include "./perf.h"
 #include "./fft.h"
 #include "./windows.h"
@@ -212,13 +213,18 @@ namespace spectral {
 			this->_fftSize = fftSize;
 			this->_interval = newInterval;
 			validUntilIndex = -1;
-			
-			using Kaiser = ::signalsmith::windows::Kaiser;
 
-			/// Roughly optimal Kaiser for STFT analysis (forced to perfect reconstruction)
 			auto &window = fft.setSizeWindow(fftSize);
-			auto kaiser = Kaiser::withBandwidth(windowSize/(double)_interval, true);
-			kaiser.fill(window, windowSize);
+			if (windowShape == Window::kaiser) {
+				using Kaiser = ::signalsmith::windows::Kaiser;
+				/// Roughly optimal Kaiser for STFT analysis (forced to perfect reconstruction)
+				auto kaiser = Kaiser::withBandwidth(windowSize/double(_interval), true);
+				kaiser.fill(window, windowSize);
+			} else {
+				using Confined = ::signalsmith::windows::ApproximateConfinedGaussian;
+				auto confined = Confined::withBandwidth(windowSize/double(_interval));
+				confined.fill(window, windowSize);
+			}
 			::signalsmith::windows::forcePerfectReconstruction(window, windowSize, _interval);
 			
 			// TODO: fill extra bits of an input buffer with NaN/Infinity, to break this, and then fix by adding zero-padding to WindowedFFT (as opposed to zero-valued window sections)
@@ -230,10 +236,19 @@ namespace spectral {
 			timeBuffer.resize(fftSize);
 		}
 	public:
+		/** Swaps between the default (Kaiser) shape and Approximate Confined Gaussian (ACG).
+		\diagram{stft-windows.svg,Default (Kaiser) windows and partial cumulative sum}
+		The ACG has better rolloff since its edges go to 0:
+		\diagram{stft-windows-acg.svg,ACG windows and partial cumulative sum}
+		However, it generally has worse performance in terms of total sidelobe energy, affecting worst-case aliasing levels for (most) higher overlap ratios:
+		\diagram{stft-aliasing-simulated-acg.svg,Simulated bad-case aliasing for ACG windows - compare with above}*/
+		enum class Window {kaiser, acg};
+		Window windowShape = Window::kaiser;
+		
 		using Spectrum = MultiSpectrum;
 		Spectrum spectrum;
 		WindowedFFT<Sample> fft;
-
+		
 		STFT() {}
 		/// Parameters passed straight to `.resize()`
 		STFT(int channels, int windowSize, int interval, int historyLength=0, int zeroPadding=0) {
diff --git a/dsp/windows.h b/dsp/windows.h
index 20f35a5..59f2f33 100644
--- a/dsp/windows.h
+++ b/dsp/windows.h
@@ -1,9 +1,10 @@
+#include "./common.h"
+
 #ifndef SIGNALSMITH_DSP_WINDOWS_H
 #define SIGNALSMITH_DSP_WINDOWS_H
 
-#include "./common.h"
-
 #include <cmath>
+#include <algorithm>
 
 namespace signalsmith {
 namespace windows {
@@ -77,7 +78,7 @@ namespace windows {
 		/// @name Performance methods
 		/// @{
 		/** @brief Total energy ratio (in dB) between side-lobes and the main lobe.
-			\diagram{kaiser-bandwidth-sidelobes-energy.svg,Measured main/side lobe energy ratio.  You can see that the heuristic improves performance for all bandwidth values.}
+			\diagram{windows-kaiser-sidelobe-energy.svg,Measured main/side lobe energy ratio.  You can see that the heuristic improves performance for all bandwidth values.}
 			This function uses an approximation which is accurate to ±0.5dB for 2 ⩽ bandwidth ≤ 10, or 1 ⩽ bandwidth ≤ 10 when `heuristicOptimal`is enabled.
 		*/
 		static double bandwidthToEnergyDb(double bandwidth, bool heuristicOptimal=false) {
@@ -105,7 +106,7 @@ namespace windows {
 			return bw;
 		}
 		/** @brief Peak ratio (in dB) between side-lobes and the main lobe.
-			\diagram{kaiser-bandwidth-sidelobes-peak.svg,Measured main/side lobe peak ratio.  You can see that the heuristic improves performance, except in the bandwidth range 1-2 where peak ratio was sacrificed to improve total energy ratio.}
+			\diagram{windows-kaiser-sidelobe-peaks.svg,Measured main/side lobe peak ratio.  You can see that the heuristic improves performance, except in the bandwidth range 1-2 where peak ratio was sacrificed to improve total energy ratio.}
 			This function uses an approximation which is accurate to ±0.5dB for 2 ⩽ bandwidth ≤ 9, or 0.5 ⩽ bandwidth ≤ 9 when `heuristicOptimal`is enabled.
 		*/
 		static double bandwidthToPeakDb(double bandwidth, bool heuristicOptimal=false) {
@@ -134,7 +135,7 @@ namespace windows {
 		/** @} */
 
 		/** Equivalent noise bandwidth (ENBW), a measure of frequency resolution.
-			\diagram{kaiser-bandwidth-enbw.svg,Measured ENBW, with and without the heuristic bandwidth adjustment.}
+			\diagram{windows-kaiser-enbw.svg,Measured ENBW\, with and without the heuristic bandwidth adjustment.}
 			This approximation is accurate to ±0.05 up to a bandwidth of 22.
 		*/
 		static double bandwidthToEnbw(double bandwidth, bool heuristicOptimal=false) {
@@ -152,7 +153,7 @@ namespace windows {
 	
 		/// Fills an arbitrary container with a Kaiser window
 		template<typename Data>
-		void fill(Data &data, int size) const {
+		void fill(Data &&data, int size) const {
 			double invSize = 1.0/size;
 			for (int i = 0; i < size; ++i) {
 				double r = (2*i + 1)*invSize - 1;
@@ -162,12 +163,45 @@ namespace windows {
 		}
 	};
 
+	/** @brief The Approximate Confined Gaussian window is (almost) optimal
+		
+		ACG windows can be constructing using the shape-parameter (sigma) or using the static `with???()` methods.*/
+	class ApproximateConfinedGaussian {
+		double gaussianFactor;
+		
+		double gaussian(double x) const {
+			return std::exp(-x*x*gaussianFactor);
+		}
+	public:
+		/// Heuristic map from bandwidth to the appropriately-optimal sigma
+		static double bandwidthToSigma(double bandwidth) {
+			return 0.3/std::sqrt(bandwidth);
+		}
+		static ApproximateConfinedGaussian withBandwidth(double bandwidth) {
+			return ApproximateConfinedGaussian(bandwidthToSigma(bandwidth));
+		}
+
+		ApproximateConfinedGaussian(double sigma) : gaussianFactor(0.0625/(sigma*sigma)) {}
+	
+		/// Fills an arbitrary container
+		template<typename Data>
+		void fill(Data &&data, int size) const {
+			double invSize = 1.0/size;
+			double offsetScale = gaussian(1)/(gaussian(3) + gaussian(-1));
+			double norm = 1/(gaussian(0) - 2*offsetScale*(gaussian(2)));
+			for (int i = 0; i < size; ++i) {
+				double r = (2*i + 1)*invSize - 1;
+				data[i] = norm*(gaussian(r) - offsetScale*(gaussian(r - 2) + gaussian(r + 2)));
+			}
+		}
+	};
+
 	/** Forces STFT perfect-reconstruction (WOLA) on an existing window, for a given STFT interval.
 	For example, here are perfect-reconstruction versions of the approximately-optimal @ref Kaiser windows:
 	\diagram{kaiser-windows-heuristic-pr.svg,Note the lower overall energy\, and the pointy top for 2x bandwidth. Spectral performance is about the same\, though.}
 	*/
 	template<typename Data>
-	void forcePerfectReconstruction(Data &data, int windowLength, int interval) {
+	void forcePerfectReconstruction(Data &&data, int windowLength, int interval) {
 		for (int i = 0; i < interval; ++i) {
 			double sum2 = 0;
 			for (int index = i; index < windowLength; index += interval) {