Add reflected pre-roll to .outputSeek()

.flush() processes new output (zero-valued input) for longer lengths
Use .outputSeek() for .exact()
2025-08-11 16:37:44 +01:00 · 2025-08-11 14:54:32 +01:00 · 2025-08-10 21:15:04 +01:00 · 2025-08-10 20:15:45 +01:00 · 2025-08-10 20:15:45 +01:00 · 2025-08-10 20:15:23 +01:00
20 changed files with 461 additions and 1234 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "cmd/util"]
+	path = cmd/util
+	url = https://github.com/geraintluff/util.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -7,7 +7,7 @@ include(FetchContent)
 FetchContent_Declare(
 	signalsmith-linear
 	GIT_REPOSITORY https://github.com/Signalsmith-Audio/linear.git
-	GIT_TAG 0.1.2
+	GIT_TAG 0.2.3
 	GIT_SHALLOW ON
 )
 FetchContent_MakeAvailable(signalsmith-linear)
--- a/README.md
+++ b/README.md
@ -149,7 +149,7 @@ The algorithm has a lot of number-crunching, so Debug builds are much slower (up

 ### Dependencies and `#define`s

-This uses the [Signalsmith Linear](https://github.com/Signalsmith-Audio/linear) library for FFTs and other speedups.  There are [flags]([Linear repo](https://github.com/Signalsmith-Audio/linear?tab=readme-ov-file#building)) to enable Accelerate (`SIGNALSMITH_USE_ACCELERATE`) or IPP (`SIGNALSMITH_USE_IPP`).
+This uses the [Signalsmith Linear](https://github.com/Signalsmith-Audio/linear) library for FFTs and other speedups.  There are [flags](https://github.com/Signalsmith-Audio/linear?tab=readme-ov-file#building) to use faster FFT implementations (`SIGNALSMITH_USE_ACCELERATE`/`SIGNALSMITH_USE_IPP`/`SIGNALSMITH_USE_PFFFT`/`SIGNALSMITH_USE_PFFFT_DOUBLE`), and corresponding CMake options.

 ## License

--- a/SUPPORT.txt
+++ b/SUPPORT.txt
@ -0,0 +1,3 @@
+# See: https://github.com/geraintluff/SUPPORT.txt
+
+2030-01-01 Geraint Luff <geraint@signalsmith-audio.co.uk>
--- a/cmd/Makefile
+++ b/cmd/Makefile
@ -11,19 +11,19 @@ out/stretch: main.cpp ../signalsmith-stretch.h util/*.h util/*.hxx
 # Uses input files from: https://signalsmith-audio.co.uk/code/stretch/inputs.zip
 examples: out/stretch
 	mkdir -p out/examples
-	inputs/run-all.sh out/examples/u2- out/stretch --semitones=2 --exact
-	inputs/run-all.sh out/examples/d2- out/stretch --semitones=-2 --exact
-	inputs/run-all.sh out/examples/u4- out/stretch --semitones=4 --exact
-	inputs/run-all.sh out/examples/d4- out/stretch --semitones=-4 --exact
-	inputs/run-all.sh out/examples/u8- out/stretch --semitones=8 --exact
-	inputs/run-all.sh out/examples/d8- out/stretch --semitones=-8 --exact
-	inputs/run-all.sh out/examples/u16- out/stretch --semitones=16 --exact
-	inputs/run-all.sh out/examples/d16- out/stretch --semitones=-16 --exact
-	inputs/run-all.sh out/examples/t_8- out/stretch --time=0.8 --exact
-	inputs/run-all.sh out/examples/t1_2- out/stretch --time=1.2 --exact
-	inputs/run-all.sh out/examples/t1_5- out/stretch --time=1.5 --exact
-	inputs/run-all.sh out/examples/t2- out/stretch --time=2 --exact
-	inputs/run-all.sh out/examples/t4- out/stretch --time=4 --exact
+	inputs/run-all.sh out/examples/u2- out/stretch --semitones=2
+	inputs/run-all.sh out/examples/d2- out/stretch --semitones=-2
+	inputs/run-all.sh out/examples/u4- out/stretch --semitones=4
+	inputs/run-all.sh out/examples/d4- out/stretch --semitones=-4
+	inputs/run-all.sh out/examples/u8- out/stretch --semitones=8
+	inputs/run-all.sh out/examples/d8- out/stretch --semitones=-8
+	inputs/run-all.sh out/examples/u16- out/stretch --semitones=16
+	inputs/run-all.sh out/examples/d16- out/stretch --semitones=-16
+	inputs/run-all.sh out/examples/t_8- out/stretch --time=0.8
+	inputs/run-all.sh out/examples/t1_2- out/stretch --time=1.2
+	inputs/run-all.sh out/examples/t1_5- out/stretch --time=1.5
+	inputs/run-all.sh out/examples/t2- out/stretch --time=2
+	inputs/run-all.sh out/examples/t4- out/stretch --time=4

 TEST_WAV ?= "inputs/voice.wav"

--- a/cmd/main-dev.cpp
+++ b/cmd/main-dev.cpp
@ -0,0 +1,234 @@
+// helper for debugging
+#include <iostream>
+#define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << "\n";
+
+#define PROFILE_PLOT_CHUNKS
+#ifdef PROFILE_PLOT_CHUNKS
+size_t activeStepIndex = 0;
+void profileProcessStart(int, int);
+void profileProcessEndStep();
+void profileProcessStep(size_t, size_t);
+void profileProcessEnd();
+#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_START profileProcessStart
+#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP profileProcessStep
+#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP profileProcessEndStep
+#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_END profileProcessEnd
+#endif
+
+#include "signalsmith-stretch/signalsmith-stretch.h"
+
+#include "./util/stopwatch.h"
+#include "./util/memory-tracker.hxx"
+#include "./util/simple-args.h"
+#include "./util/wav.h"
+
+#ifdef PROFILE_PLOT_CHUNKS
+#include "plot/plot.h"
+std::vector<signalsmith::Stopwatch> processStopwatches;
+signalsmith::Stopwatch processStopwatchStart, processStopwatchEnd;
+bool started = false;
+bool activeStep = false;
+void profileProcessStart(int /*inputSamples*/, int /*outputSamples*/) { // bound to SIGNALSMITH_STRETCH_PROFILE_PROCESS_START
+	started = true; // the "start" lap is open until profileProcessEndStep() closes it
+	activeStep = false; // no numbered step is being timed yet
+	processStopwatchStart.startLap();
+}
+void profileProcessEndStep() { // closes whichever lap is open, then starts the "end" lap
+	if (!activeStep && started) { // no numbered step open: the pending lap is the "start" one
+		started = false;
+		processStopwatchStart.lap();
+	} else if (activeStep) { // a numbered step is open: close its stopwatch
+		activeStep = false;
+		processStopwatches[activeStepIndex].lap();
+	}
+	processStopwatchEnd.startLap();
+}
+void profileProcessStep(size_t step, size_t count) {
+	profileProcessEndStep(); // close the previous step (or the "start" lap) before opening this one
+	if (count > processStopwatches.size()) { // grow to one stopwatch per reported step
+		processStopwatches.resize(count);
+	}
+	activeStepIndex = step;
+	activeStep = true;
+	processStopwatches[step].startLap();
+}
+void profileProcessEnd() { // bound to SIGNALSMITH_STRETCH_PROFILE_PROCESS_END
+	processStopwatchEnd.lap(); // pairs with the startLap() issued by profileProcessEndStep()
+}
+#endif
+
+int main(int argc, char* argv[]) { // dev CLI: stretches a WAV, reports timing/memory, and compares against a reference render
+	signalsmith::stretch::SignalsmithStretch<float/*, std::ranlux48_base*/> stretch; // optional cheaper RNG for performance comparison
+
+#ifdef PROFILE_PLOT_CHUNKS
+	processStopwatches.reserve(1000);
+#endif
+
+	SimpleArgs args(argc, argv);
+	
+	if (args.hasFlag("v", "prints the version")) {
+		std::cout << stretch.version[0] << "." << stretch.version[1] << "." << stretch.version[2] << "\n";
+		return 0;
+	}
+	
+	std::string inputWav = args.arg<std::string>("input.wav", "16-bit WAV file");
+	std::string outputWav = args.arg<std::string>("output.wav", "output WAV file");
+	
+	double semitones = args.flag<double>("semitones", "pitch-shift amount", 0);
+	double formants = args.flag<double>("formant", "formant-shift amount (semitones)", 0);
+	bool formantComp = args.hasFlag("formant-comp", "formant compensation");
+	double formantBase = args.flag<double>("formant-base", "formant base frequency (Hz, 0=auto)", 0);
+	double tonality = args.flag<double>("tonality", "tonality limit (Hz)", 8000);
+	double time = args.flag<double>("time", "time-stretch factor", 1);
+	bool exactLength = args.hasFlag("exact", "trims the start/end so the output has the correct length");
+	bool splitComputation = args.hasFlag("split-computation", "distributes the computation more evenly (but higher latency)");
+	args.errorExit();
+	
+	std::cout << Console::Bright << inputWav << Console::Reset;
+	std::cout << " -> ";
+	std::cout << Console::Bright << outputWav << Console::Reset << "\n";
+	std::cout << "\tsemitones: " << semitones << "\n\t     time: " << time << "x" << (exactLength ? " (exact)" : "") << "\n\t tonality: " << tonality << "Hz\n";
+
+	Wav inWav;
+	std::cout << inputWav << " -> " << outputWav << "\n";
+	if (!inWav.read(inputWav).warn()) args.errorExit("failed to read WAV");
+	size_t inputLength = inWav.samples.size()/inWav.channels;
+	
+	Wav prevWav; // Used during development, to compare against known-good previous render
+	bool compareReference = (time <= 1.6);
+	if (compareReference && !prevWav.read(outputWav + "-reference.wav")) {
+		if (prevWav.read(outputWav)) { // no reference yet: promote the existing output file to be the reference
+			prevWav.write(outputWav + "-reference.wav");
+		}
+	}
+
+	Wav outWav;
+	outWav.channels = inWav.channels;
+	outWav.sampleRate = inWav.sampleRate;
+	int outputLength = std::round(inputLength*time);
+
+	signalsmith::MemoryTracker initMemory;
+	signalsmith::Stopwatch stopwatch;
+
+	stopwatch.start();
+	stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation);
+	stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
+	stretch.setFormantSemitones(formants, formantComp);
+	stretch.setFormantBase(formantBase/inWav.sampleRate);
+	double initSeconds = stopwatch.lap();
+
+	initMemory = initMemory.diff();
+	std::cout << "Setup:\n\t" << initSeconds << "s\n";
+	if (initMemory.implemented) {
+		std::cout << "\tallocated " << (initMemory.allocBytes/1000) << "kB, freed " << (initMemory.freeBytes/1000) << "kB\n";
+	}
+
+	signalsmith::MemoryTracker processMemory;
+
+	if (exactLength) {
+		outWav.samples.resize(outputLength*outWav.channels);
+		stopwatch.start();
+		processMemory = {};
+		stretch.exact(inWav, int(inputLength), outWav, int(outputLength));
+	} else {
+		// pad the input at the end, since we'll be reading slightly ahead
+		size_t paddedInputLength = inputLength + stretch.inputLatency();
+		inWav.samples.resize(paddedInputLength*inWav.channels);
+		// pad the output at the end, since we have output latency as well
+		int tailSamples = stretch.outputLatency() + stretch.inputLatency(); // this branch is never exact-length, so add a bit more output to catch any wobbles past the end
+		int paddedOutputLength = outputLength + tailSamples;
+		outWav.samples.resize(paddedOutputLength*outWav.channels);
+
+		stopwatch.start();
+		// The simplest way to deal with input latency (when we have access to the audio buffer) is to always be slightly ahead in the input
+		stretch.seek(inWav, stretch.inputLatency(), 1/time);
+		inWav.offset += stretch.inputLatency();
+		// Process it all in one call, although it works just the same if we split into smaller blocks
+		processMemory = {};
+		stretch.process(inWav, int(inputLength), outWav, int(outputLength));
+		// Read the last bit of output without giving it any more input
+		outWav.offset += outputLength;
+		stretch.flush(outWav, tailSamples);
+		outWav.offset -= outputLength;
+	}
+
+	double processSeconds = stopwatch.lap();
+	double processRate = (inWav.length()/inWav.sampleRate)/processSeconds;
+	double processPercent = 100/processRate;
+	processMemory = processMemory.diff();
+	std::cout << "Process:\n\t" << processSeconds << "s, " << processRate << "x realtime, " << processPercent << "% CPU\n";
+	if (processMemory.implemented) {
+		std::cout << "\tallocated " << (processMemory.allocBytes/1000) << "kB, freed " << (processMemory.freeBytes/1000) << "kB\n";
+		if (processMemory) args.errorExit("allocated during process()");
+	}
+	
+#ifdef PROFILE_PLOT_CHUNKS
+	signalsmith::plot::Figure figure;
+	auto &plot = figure(0, 0).plot(400, 150);
+	plot.x.blank().label("step");
+	plot.y.major(0, "");
+	plot.title("computation time");
+	auto &cumulativePlot = figure(1, 0).plot(150, 150);
+	cumulativePlot.x.major(processStopwatches.size(), "");
+	cumulativePlot.y.major(0, "");
+	cumulativePlot.title("cumulative");
+	auto &line = plot.line().fillToY(0);
+	auto &extraLine = plot.line().fillToY(0);
+	auto &cumulativeLine = cumulativePlot.line();
+	auto &flatLine = cumulativePlot.line();
+	double cumulativeTime = 0;
+	line.add(0, 0);
+	cumulativeLine.add(0, 0);
+	for (size_t i = 0; i < processStopwatches.size(); ++i) {
+		double time = processStopwatches[i].total();
+		if (i%5 == 0) {
+			plot.x.tick(i + 0.5, std::to_string(i));
+		} else {
+			plot.x.tick(i + 0.5, "");
+		}
+		line.add(i, time);
+		line.add(i + 1, time);
+
+		cumulativeTime += time;
+		cumulativeLine.add(i, cumulativeTime);
+		cumulativeLine.add(i + 1, cumulativeTime);
+	}
+	line.add(processStopwatches.size(), 0);
+	extraLine.add(0, 0);
+	extraLine.add(0, processStopwatchStart.total());
+	extraLine.add(1, processStopwatchStart.total());
+	extraLine.add(1, 0);
+	extraLine.add(processStopwatches.size() - 1, 0);
+	extraLine.add(processStopwatches.size() - 1, processStopwatchEnd.total());
+	extraLine.add(processStopwatches.size(), processStopwatchEnd.total());
+	extraLine.add(processStopwatches.size(), 0);
+	flatLine.add(0, 0);
+	flatLine.add(processStopwatches.size(), cumulativeTime);
+	figure.write("profile.svg");
+#endif
+
+	if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV");
+	
+	if (compareReference && prevWav.result) {
+		outWav.read(outputWav);
+		if (prevWav.length() != outWav.length() || prevWav.samples.size() != outWav.samples.size()) args.errorExit("lengths differ"); // total sample counts must match too, since the loop below indexes both buffers
+		double diff2 = 0;
+		for (size_t i = 0; i < prevWav.samples.size(); ++i) {
+			double diff = prevWav.samples[i] - outWav.samples[i];
+			diff2 += diff*diff;
+		}
+		diff2 /= prevWav.samples.size();
+		double diffDb = 10*std::log10(diff2);
+		std::cout << "Reference:\n\tdifference: ";
+		if (diff2 < 1e-10) { // below -100dB: check the tighter bound first, otherwise this branch is unreachable
+			std::cout << Console::Green;
+		} else if (diff2 < 1e-6) { // below -60dB, which is the failure threshold used below
+			std::cout << Console::Yellow;
+		} else {
+			std::cout << Console::Red;
+		}
+		
+		std::cout << Console::Bright << diffDb << Console::Reset << " dB\n";
+		if (diffDb > -60) args.errorExit("too much difference\n");
+	}
+}
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@ -2,233 +2,85 @@
 #include <iostream>
 #define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << "\n";

-#define PROFILE_PLOT_CHUNKS
-#ifdef PROFILE_PLOT_CHUNKS
-size_t activeStepIndex = 0;
-void profileProcessStart(int, int);
-void profileProcessEndStep();
-void profileProcessStep(size_t, size_t);
-void profileProcessEnd();
-#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_START profileProcessStart
-#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP profileProcessStep
-#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP profileProcessEndStep
-#	define SIGNALSMITH_STRETCH_PROFILE_PROCESS_END profileProcessEnd
-#endif
-
 #include "signalsmith-stretch/signalsmith-stretch.h"
+using SignalsmithStretch = signalsmith::stretch::SignalsmithStretch<float>;

-#include "./util/stopwatch.h"
-#include "./util/memory-tracker.hxx"
 #include "./util/simple-args.h"
 #include "./util/wav.h"

-#ifdef PROFILE_PLOT_CHUNKS
-#include "plot/plot.h"
-std::vector<signalsmith::Stopwatch> processStopwatches;
-signalsmith::Stopwatch processStopwatchStart, processStopwatchEnd;
-bool started = false;
-bool activeStep = false;
-void profileProcessStart(int /*inputSamples*/, int /*outputSamples*/) {
-	activeStep = false;
-	started = true;
-	processStopwatchStart.startLap();
-}
-void profileProcessEndStep() {
-	if (activeStep) {
-		activeStep = false;
-		processStopwatches[activeStepIndex].lap();
-	} else if (started) {
-		started = false;
-		processStopwatchStart.lap();
-	}
-	processStopwatchEnd.startLap();
-}
-void profileProcessStep(size_t step, size_t count) {
-	profileProcessEndStep();
-	activeStep = true;
-	activeStepIndex = step;
-	if (processStopwatches.size() < count) {
-		processStopwatches.resize(count);
-	}
-	processStopwatches[step].startLap();
-}
-void profileProcessEnd() {
-	processStopwatchEnd.lap();
-}
-#endif
-
 int main(int argc, char* argv[]) {
-	signalsmith::stretch::SignalsmithStretch<float/*, std::ranlux48_base*/> stretch; // optional cheaper RNG for performance comparison
-
-#ifdef PROFILE_PLOT_CHUNKS
-	processStopwatches.reserve(1000);
-#endif
-
 	SimpleArgs args(argc, argv);
 	
 	if (args.hasFlag("v", "prints the version")) {
-		std::cout << stretch.version[0] << "." << stretch.version[1] << "." << stretch.version[2] << "\n";
+		auto &version = SignalsmithStretch::version;
+		std::cout << version[0] << "." << version[1] << "." << version[2] << "\n";
 		return 0;
 	}

 	std::string inputWav = args.arg<std::string>("input.wav", "16-bit WAV file");
 	std::string outputWav = args.arg<std::string>("output.wav", "output WAV file");
-	
 	double semitones = args.flag<double>("semitones", "pitch-shift amount", 0);
 	double formants = args.flag<double>("formant", "formant-shift amount (semitones)", 0);
 	bool formantComp = args.hasFlag("formant-comp", "formant compensation");
-	double formantBase = args.flag<double>("formant-base", "formant base frequency (Hz, 0=auto)", 0);
+	double formantBase = args.flag<double>("formant-base", "formant base frequency (Hz, 0=auto)", 100);
 	double tonality = args.flag<double>("tonality", "tonality limit (Hz)", 8000);
 	double time = args.flag<double>("time", "time-stretch factor", 1);
-	bool exactLength = args.hasFlag("exact", "trims the start/end so the output has the correct length");
 	bool splitComputation = args.hasFlag("split-computation", "distributes the computation more evenly (but higher latency)");
-	args.errorExit();
+	args.errorExit(); // exits on error, or with `--help`

-	std::cout << Console::Bright << inputWav << Console::Reset;
-	std::cout << " -> ";
-	std::cout << Console::Bright << outputWav << Console::Reset << "\n";
-	std::cout << "\tsemitones: " << semitones << "\n\t     time: " << time << "x" << (exactLength ? " (exact)" : "") << "\n\t tonality: " << tonality << "Hz\n";
+	std::cout << inputWav << " -> " << outputWav << "\n";

 	Wav inWav;
-	std::cout << inputWav << " -> " << outputWav << "\n";
 	if (!inWav.read(inputWav).warn()) args.errorExit("failed to read WAV");
-	size_t inputLength = inWav.samples.size()/inWav.channels;
 	
-	Wav prevWav; // Used during development, to compare against known-good previous render
-	bool compareReference = (time <= 1.6);
-	if (compareReference && !prevWav.read(outputWav + "-reference.wav")) {
-		if (prevWav.read(outputWav)) {
-			prevWav.write(outputWav + "-reference.wav");
-		}
-	}
+	size_t inputLength = inWav.length();
+	size_t outputLength = std::round(inputLength*time);
 	
 	Wav outWav;
 	outWav.channels = inWav.channels;
 	outWav.sampleRate = inWav.sampleRate;
-	int outputLength = std::round(inputLength*time);
+	outWav.resize(outputLength);

-	signalsmith::MemoryTracker initMemory;
-	signalsmith::Stopwatch stopwatch;
-
-	stopwatch.start();
+	SignalsmithStretch stretch;
 	stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation);
 	stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
 	stretch.setFormantSemitones(formants, formantComp);
 	stretch.setFormantBase(formantBase/inWav.sampleRate);
-	double initSeconds = stopwatch.lap();

-	initMemory = initMemory.diff();
-	std::cout << "Setup:\n\t" << initSeconds << "s\n";
-	if (initMemory.implemented) {
-		std::cout << "\tallocated " << (initMemory.allocBytes/1000) << "kB, freed " << (initMemory.freeBytes/1000) << "kB\n";
-	}
+	/* Since the WAV helper allows sample access like `wav[c][index]`, we could just call:
 	
-	signalsmith::MemoryTracker processMemory;
-
-	if (exactLength) {
-		outWav.samples.resize(outputLength*outWav.channels);
-		stopwatch.start();
-		processMemory = {};
 		stretch.exact(inWav, int(inputLength), outWav, int(outputLength));
-	} else {
-		// pad the input at the end, since we'll be reading slightly ahead
-		size_t paddedInputLength = inputLength + stretch.inputLatency();
-		inWav.samples.resize(paddedInputLength*inWav.channels);
-		// pad the output at the end, since we have output latency as well
-		int tailSamples = exactLength ? stretch.outputLatency() : (stretch.outputLatency() + stretch.inputLatency()); // if we don't need exact length, add a bit more output to catch any wobbles past the end
-		int paddedOutputLength = outputLength + tailSamples;
-		outWav.samples.resize(paddedOutputLength*outWav.channels);
 		
-		stopwatch.start();
-		// The simplest way to deal with input latency (when have access to the audio buffer) is to always be slightly ahead in the input
-		stretch.seek(inWav, stretch.inputLatency(), 1/time);
-		inWav.offset += stretch.inputLatency();
-		// Process it all in one call, although it works just the same if we split into smaller blocks
-		processMemory = {};
-		stretch.process(inWav, int(inputLength), outWav, int(outputLength));
-		// Read the last bit of output without giving it any more input
-		outWav.offset += outputLength;
-		stretch.flush(outWav, tailSamples);
-		outWav.offset -= outputLength;
-	}
+	However, we'll do it in separate stages to show more of the API. */
 	
-	double processSeconds = stopwatch.lap();
-	double processRate = (inWav.length()/inWav.sampleRate)/processSeconds;
-	double processPercent = 100/processRate;
-	processMemory = processMemory.diff();
-	std::cout << "Process:\n\t" << processSeconds << "s, " << processRate << "x realtime, " << processPercent << "% CPU\n";
-	if (processMemory.implemented) {
-		std::cout << "\tallocated " << (processMemory.allocBytes/1000) << "kB, freed " << (processMemory.freeBytes/1000) << "kB\n";
-		if (processMemory) args.errorExit("allocated during process()");
-	}
+	// First, an "output seek", where we provide a chunk of input.
+	// This is suitable for starting playback of a sample at a given playback rate.
+	auto seekLength = stretch.outputSeekLength(1/time);
+	stretch.outputSeek(inWav, seekLength);
+	// At this point, the next output samples we get will correspond to the beginning of the audio file.

-#ifdef PROFILE_PLOT_CHUNKS
-	signalsmith::plot::Figure figure;
-	auto &plot = figure(0, 0).plot(400, 150);
-	plot.x.blank().label("step");
-	plot.y.major(0, "");
-	plot.title("computation time");
-	auto &cumulativePlot = figure(1, 0).plot(150, 150);
-	cumulativePlot.x.major(processStopwatches.size(), "");
-	cumulativePlot.y.major(0, "");
-	cumulativePlot.title("cumulative");
-	auto &line = plot.line().fillToY(0);
-	auto &extraLine = plot.line().fillToY(0);
-	auto &cumulativeLine = cumulativePlot.line();
-	auto &flatLine = cumulativePlot.line();
-	double cumulativeTime = 0;
-	line.add(0, 0);
-	cumulativeLine.add(0, 0);
-	for (size_t i = 0; i < processStopwatches.size(); ++i) {
-		double time = processStopwatches[i].total();
-		if (i%5 == 0) {
-			plot.x.tick(i + 0.5, std::to_string(i));
-		} else {
-			plot.x.tick(i + 0.5, "");
-		}
-		line.add(i, time);
-		line.add(i + 1, time);
+	// We're going to process until *just* before the end of the audio file (so we can get a tidier end using `.flush()`).
+	int outputIndex = outputLength - stretch.intervalSamples();

-		cumulativeTime += time;
-		cumulativeLine.add(i, cumulativeTime);
-		cumulativeLine.add(i + 1, cumulativeTime);
-	}
-	line.add(processStopwatches.size(), 0);
-	extraLine.add(0, 0);
-	extraLine.add(0, processStopwatchStart.total());
-	extraLine.add(1, processStopwatchStart.total());
-	extraLine.add(1, 0);
-	extraLine.add(processStopwatches.size() - 1, 0);
-	extraLine.add(processStopwatches.size() - 1, processStopwatchEnd.total());
-	extraLine.add(processStopwatches.size(), processStopwatchEnd.total());
-	extraLine.add(processStopwatches.size(), 0);
-	flatLine.add(0, 0);
-	flatLine.add(processStopwatches.size(), cumulativeTime);
-	figure.write("profile.svg");
-#endif
+	// Stretch's internal output position is slightly ahead of the output samples we get
+	int outputPos = outputIndex + stretch.outputLatency();
+	// Time-map: where do we want the input position to be at that moment?
+	int inputPos = std::round(outputPos/time);
+	// And therefore which input samples do we need to be supplying?
+	int inputIndex = inputPos + stretch.inputLatency();
+	
+	// In this particular case, our `inputPos` will be at the end of the file
+	// and `inputIndex` will be beyond the end, so we pad with 0s to have enough input
+	inWav.resize(inputIndex);
+
+	// OK, go for it
+	inWav.offset = seekLength;
+	stretch.process(inWav, inputIndex - seekLength, outWav, outputIndex);
+	
+	// And as promised, get the last bits using `.flush()`, which does some extra stuff to avoid introducing clicks.
+	outWav.offset = outputIndex;
+	stretch.flush(outWav, outputLength - outputIndex);
+	outWav.offset = 0;

 	if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV");
-	
-	if (compareReference && prevWav.result) {
-		outWav.read(outputWav);
-		if (prevWav.length() != outWav.length()) args.errorExit("lengths differ");
-		double diff2 = 0;
-		for (size_t i = 0; i < prevWav.samples.size(); ++i) {
-			double diff = prevWav.samples[i] - outWav.samples[i];
-			diff2 += diff*diff;
-		}
-		diff2 /= prevWav.samples.size();
-		double diffDb = 10*std::log10(diff2);
-		std::cout << "Reference:\n\tdifference: ";
-		if (diff2 < 1e-6) {
-			std::cout << Console::Yellow;
-		} else if (diff2 < 1e-10) {
-			std::cout << Console::Green;
-		} else {
-			std::cout << Console::Red;
-		}
-		
-		std::cout << Console::Bright << diffDb << Console::Reset << " dB\n";
-		if (diffDb > -60) args.errorExit("too much difference\n");
-	}
 }
--- a/cmd/util
+++ b/cmd/util
@ -0,0 +1 @@
+Subproject commit aeb4e31077a453566b58fff1c7e7e998ac824157
--- a/cmd/util/console-colours.h
+++ b/cmd/util/console-colours.h
@ -1,41 +0,0 @@
-#pragma once
-#ifndef _CONSOLE_COLOURS_H
-#define _CONSOLE_COLOURS_H
-
-#include <string>
-
-namespace Console {
-	std::string Reset = "\x1b[0m";
-	std::string Bright = "\x1b[1m";
-	std::string Dim = "\x1b[2m";
-	std::string Underscore = "\x1b[4m";
-	std::string Blink = "\x1b[5m";
-	std::string Reverse = "\x1b[7m";
-	std::string Hidden = "\x1b[8m";
-
-	namespace Foreground {
-		std::string Black = "\x1b[30m";
-		std::string Red = "\x1b[31m";
-		std::string Green = "\x1b[32m";
-		std::string Yellow = "\x1b[33m";
-		std::string Blue = "\x1b[34m";
-		std::string Magenta = "\x1b[35m";
-		std::string Cyan = "\x1b[36m";
-		std::string White = "\x1b[37m";
-	}
-
-	namespace Background {
-		std::string Black = "\x1b[40m";
-		std::string Red = "\x1b[41m";
-		std::string Green = "\x1b[42m";
-		std::string Yellow = "\x1b[43m";
-		std::string Blue = "\x1b[44m";
-		std::string Magenta = "\x1b[45m";
-		std::string Cyan = "\x1b[46m";
-		std::string White = "\x1b[47m";
-	}
-
-	using namespace Foreground;
-}
-
-#endif
--- a/cmd/util/memory-tracker.h
+++ b/cmd/util/memory-tracker.h
@ -1,29 +0,0 @@
-/* Currently only working/tested on Mac.  You need to compile in `memory-tracker.cpp` as well, which does the actual stuff */
-#ifndef SIGNALSMITH_UTIL_MEMORY_TRACKER_H
-#define SIGNALSMITH_UTIL_MEMORY_TRACKER_H
-
-#include <cstddef>
-
-namespace signalsmith {
-
-struct MemoryTracker {
-	static const bool implemented; // Whether the implementation actually tracks memory or not
-
-	size_t allocBytes, freeBytes, currentBytes;
-	MemoryTracker();
-	
-	MemoryTracker diff() const {
-		MemoryTracker now;
-		return {now.allocBytes - allocBytes, now.freeBytes - freeBytes};
-	}
-
-	// Is a `.diff()` result non-zero
-	operator bool() const {
-		return allocBytes > 0 || freeBytes > 0;
-	}
-private:
-	MemoryTracker(size_t allocBytes, size_t freeBytes) : allocBytes(allocBytes), freeBytes(freeBytes), currentBytes(allocBytes - freeBytes) {}
-};
-
-} // namespace
-#endif // include guard
--- a/cmd/util/memory-tracker.hxx
+++ b/cmd/util/memory-tracker.hxx
@ -1,118 +0,0 @@
-#include "./memory-tracker.h"
-
-#if !defined(__has_include) || !__has_include(<dlfcn.h>)
-// Fallback if we don't have <dlfcn.h>, which we use to get the existing methods
-signalsmith::MemoryTracker::MemoryTracker() : signalsmith::MemoryTracker::MemoryTracker(0, 0) {}
-const bool signalsmith::MemoryTracker::implemented = false;
-#else
-const bool signalsmith::MemoryTracker::implemented = true;
-
-#include <cstdlib>
-#include <cstddef>
-#include <dlfcn.h>
-#include <cassert>
-#include <utility>
-
-namespace signalsmith {
-
-namespace memory_tracker {
-
-static size_t memoryTrackerAllocCounter = 0;
-static size_t memoryTrackerFreeCounter = 0;
-
-static void * (*originalCalloc)(size_t, size_t) = nullptr;
-static void * (*originalMalloc)(size_t) = nullptr;
-static void * (*originalRealloc)(void*, size_t) = nullptr;
-static void (*originalFree)(void*) = nullptr;
-
-template<class Fn>
-static void cacheOriginal(Fn& fn, const char *symbolName) {
-	if (!fn) {
-		fn = (Fn)dlsym(RTLD_NEXT, symbolName);
-		if (!fn) exit(1);
-	}
-}
-
-template<class Fn, typename ...Args>
-auto callOriginal(Fn& fn, const char *symbolName, Args &&...args)
-		-> decltype(fn(std::forward<Args>(args)...)) {
-	cacheOriginal(fn, symbolName);
-	return fn(std::forward<Args>(args)...);
-}
-
-static constexpr size_t extraInfoBytes = sizeof(std::max_align_t)*2;
-void * storeAllocInfo(void *offsetPointer, void *originalPointer, size_t size) {
-	if (!originalPointer) return nullptr;
-	memoryTrackerAllocCounter += size;
-	
-	assert(!((size_t(offsetPointer))%sizeof(size_t))); // make sure it's aligned to size_t
-	size_t *sizePtr = (size_t *)offsetPointer;
-	sizePtr[-1] = size_t(originalPointer);
-	sizePtr[-2] = size;
-	return offsetPointer;
-}
-size_t getAllocSize(void *ptr) {
-	assert(!(size_t(ptr)%sizeof(size_t)));
-	size_t *sizePtr = (size_t *)ptr;
-	return sizePtr[-2];
-}
-void * getAllocPointer(void *ptr) {
-	assert(!(size_t(ptr)%sizeof(size_t)));
-	size_t *sizePtr = (size_t *)ptr;
-	return (void *)sizePtr[-1];
-}
-
-}} // namespaces
-
-extern "C" {
-	void * malloc(size_t size) {
-		void *ptr = signalsmith::memory_tracker::callOriginal(signalsmith::memory_tracker::originalMalloc, "malloc", size + signalsmith::memory_tracker::extraInfoBytes);
-		return signalsmith::memory_tracker::storeAllocInfo((unsigned char *)ptr + signalsmith::memory_tracker::extraInfoBytes, ptr, size);
-	}
-
-	void * calloc(size_t size, size_t count) {
-		size_t extraCount = (signalsmith::memory_tracker::extraInfoBytes + size - 1)/size; // enough extra items to store what we need
-		void *ptr = signalsmith::memory_tracker::callOriginal(signalsmith::memory_tracker::originalCalloc, "calloc", size, count + extraCount);
-		return signalsmith::memory_tracker::storeAllocInfo((unsigned char *)ptr + size*extraCount, ptr, size*count);
-	}
-
-	void * realloc(void *ptr, size_t size) {
-		void *originalPtr = signalsmith::memory_tracker::getAllocPointer(ptr);
-		auto pointerOffset = (unsigned char *)ptr - (unsigned char *)originalPtr;
-		size_t originalSize = signalsmith::memory_tracker::getAllocSize(ptr);
-		signalsmith::memory_tracker::memoryTrackerFreeCounter += originalSize;
-		
-		ptr = signalsmith::memory_tracker::callOriginal(signalsmith::memory_tracker::originalRealloc, "realloc", originalPtr, size + pointerOffset);
-		return signalsmith::memory_tracker::storeAllocInfo((unsigned char *)ptr + pointerOffset, ptr, size);
-	}
-
-	void free(void *ptr) {
-		void *originalPtr = signalsmith::memory_tracker::getAllocPointer(ptr);
-		size_t originalSize = signalsmith::memory_tracker::getAllocSize(ptr);
-		signalsmith::memory_tracker::memoryTrackerFreeCounter += originalSize;
-
-		signalsmith::memory_tracker::callOriginal(signalsmith::memory_tracker::originalFree, "free", originalPtr);
-	}
-}
-
-#include <new>
-
-void * operator new(size_t size) {
-	return malloc(size);
-}
-
-void * operator new[](size_t size) {
-	return malloc(size);
-}
-
-void operator delete(void *ptr) noexcept {
-	free(ptr);
-}
-
-void operator delete[](void *ptr) noexcept {
-	free(ptr);
-}
-
-signalsmith::MemoryTracker::MemoryTracker() : signalsmith::MemoryTracker::MemoryTracker(signalsmith::memory_tracker::memoryTrackerAllocCounter, signalsmith::memory_tracker::memoryTrackerFreeCounter) {}
-
-#endif // check for <dlfcn.h>
--- a/cmd/util/simple-args.h
+++ b/cmd/util/simple-args.h
@ -1,322 +0,0 @@
-#include <iostream>
-#include <string>
-#include <cstring>
-#include <vector>
-#include <map>
-#include <set>
-#include <cstdlib> // exit() and codes
-
-#include "console-colours.h"
-
-/** Expected use:
-
-		SimpleArgs args(argc, argv);
-
-		// positional argument
-		std::string foo = args.arg<std::string>("foo");
-		// optional argument
-		std::string bar = args.arg<std::string>("bar", "a string for Bar", "default");
-		// --flag=value
-		double = args.flag<double>("baz", "an optional flag", 5);
-		
-		// Exits if "foo" not supplied
-		args.errorExit();
-
-	If you have multiple commands, each with their own options:
-
-		// Switches based on a command
-		if (args.command("bink", "Bink description")) {
-			// collect arguments for the command
-		}
-		// Exits with a help message (and list of commands) if no command matched
-		args.errorCommand();
-		
-	By default, a flag of "-h" (or a command of "help", if any commands are used) prints a help message.  To override:
-		SimpleArgs args(argc, argv);
-		args.helpFlag("h");
-		args.helpCommand("help");
-	
-**/
-class SimpleArgs {
-	int argc;
-	const char* const* argv;
-
-	template<typename T>
-	T valueFromString(const char *arg);
-	
-	std::string parsedCommand;
-	struct Keywords {
-		std::string keyword;
-		std::string description;
-		bool isHelp;
-	};
-	std::vector<Keywords> keywordOptions;
-	std::vector<Keywords> argDetails;
-	std::vector<Keywords> flagOptions;
-	std::set<std::string> flagSet;
-	void clearKeywords() {
-		keywordOptions.resize(0);
-		flagSet.clear();
-		flagOptions.clear();
-	}
-	
-	bool helpMode = false;
-	bool checkedHelpCommand = false;
-	bool hasError = false;
-	std::string errorMessage;
-	void setError(std::string message) {
-		if (!hasError) {
-			hasError = true;
-			errorMessage = message;
-		}
-	}
-
-	std::map<std::string, std::string> flagMap;
-	void consumeFlags() {
-		while (index < argc && std::strlen(argv[index]) > 1 && argv[index][0] == '-') {
-			const char* arg = argv[index++];
-			size_t length = strlen(arg);
-			
-			size_t keyStart = 1, keyEnd = keyStart + 1;
-			size_t valueStart = keyEnd;
-			// If it's "--long-arg" format
-			if (length > 1 && arg[1] == '-') {
-				keyStart++;
-				while (keyEnd < length && arg[keyEnd] != '=') {
-					keyEnd++;
-				}
-				valueStart = keyEnd;
-				if (keyEnd < length) valueStart++;
-			}
-
-			std::string key = std::string(arg + keyStart, keyEnd - keyStart);
-			std::string value = std::string(arg + valueStart);
-			
-			if (key == "help") {
-				helpMode = true;
-			}
-
-			flagMap[key] = value;
-	 	}
-	 }
-	
-	int index = 1;
-public:
-	SimpleArgs(int argc, const char* const argv[]) : argc(argc), argv(argv) {
-		std::string cmd = argv[0];
-		size_t slashPos = cmd.find_last_of("\\/");
-		if (slashPos != std::string::npos) cmd = cmd.substr(slashPos + 1);
-		parsedCommand = cmd;
-	}
-
-	void help(std::ostream& out=std::cerr) const {
-		std::string parsedCommand = this->parsedCommand;
-		if (keywordOptions.size() > 0) {
-			parsedCommand += std::string(" <command>");
-		}
-		out << "Usage:\n\t" <<  parsedCommand << "\n\n";
-		if (keywordOptions.size() > 0) {
-			out << "Commands:\n";
-			for (unsigned int i = 0; i < keywordOptions.size(); i++) {
-				out << "\t" << keywordOptions[i].keyword;
-				if (keywordOptions[i].isHelp) out << " [command...]";
-				if (keywordOptions[i].description.size()) out << "  -  " << keywordOptions[i].description;
-				out << "\n";
-			}
-			out << "\n";
-		}
-		if (argDetails.size() > 0) {
-			out << "Arguments:\n";
-			for (Keywords const &arg : argDetails) {
-				out << "\t" << arg.keyword;
-				if (arg.description.size()) out << "  -  " << arg.description;
-				out << "\n";
-			}
-			out << "\n";
-		}
-		if (flagOptions.size() > 0) {
-			out << "Options: " << Console::Dim << "(--arg=value)" << Console::Reset << "\n";
-			for (Keywords const &pair : flagOptions) {
-				out << "\t" << (pair.keyword.length() > 1 ? "--" : "-") << pair.keyword;
-				if (pair.description.size()) out << "  -  " << pair.description;
-				out << "\n";
-			}
-			out << "\n";
-		}
-	}
-	
-	bool isHelp() const {
-		return helpMode;
-	}
-	bool finished() const {
-		return index >= argc;
-	}
-	std::string peek() const {
-		return (index >= argc) ? "" : argv[index];
-	}
-
-	int errorExit(std::ostream& out=std::cerr) const {
-		if (hasError || helpMode) {
-			help(out);
-			if (!helpMode) {
-				out << Console::Red << errorMessage << Console::Reset << "\n";
-			}
-			std::exit((!helpMode && hasError) ? EXIT_FAILURE : EXIT_SUCCESS);
-		}
-		return 0;
-	}
-	int errorExit(std::string forcedError, std::ostream& out=std::cerr) const {
-		if (hasError) return errorExit(out); // Argument errors take priority
-		out << Console::Red << forcedError << Console::Reset << "\n";
-		std::exit(EXIT_FAILURE);
-		return 0;
-	}
-	int errorCommand(std::string message="", std::ostream& out=std::cerr) const {
-		if (keywordOptions.size()) {
-			// We expected a command, but didn't match on any
-			if (helpMode) return errorExit(out);
-			if (index >= argc) help(out);
-			if (message.length() == 0) {
-				message = (index < argc) ? std::string("Unknown command: ") + argv[index] : "Missing command";
-			}
-			errorExit(message, out);
-		}
-		return 0;
-	}
-	
-	template<typename T=std::string>
-	T arg(std::string name, std::string longName, T defaultValue) {
-		consumeFlags();
-		if (index < argc) clearKeywords();
-		parsedCommand += std::string(" [") + name + "]";
-		argDetails.push_back(Keywords{name, longName, false});
-
-		if (index >= argc) return defaultValue;
-		return valueFromString<T>(argv[index++]);
-	}
-
-	template<typename T=std::string>
-	T arg(std::string name, std::string longName="") {
-		consumeFlags();
-		if (index < argc) clearKeywords();
-		parsedCommand += std::string(" <") + name + ">";
-		argDetails.push_back(Keywords{name, longName, false});
-
-		if (index >= argc) {
-			if (longName.length() > 0) {
-				setError("Missing " + longName + " <" + name + ">");
-			} else {
-				setError("Missing argument <" + name + ">");
-			}
-			return T();
-		}
-
-		return valueFromString<T>(argv[index++]);
-	}
-
-	bool command(std::string keyword, std::string description="", bool isHelp=false) {
-		consumeFlags();
-		if (index == 1) {
-			helpCommand();
-		}
-		if (index < argc && !keyword.compare(argv[index])) {
-			clearKeywords();
-			index++;
-			if (!isHelp) parsedCommand += std::string(" ") + keyword;
-			return true;
-		}
-		keywordOptions.push_back(Keywords{keyword, description, isHelp});
-		return false;
-	}
-	bool helpCommand(std::string keyword="help") {
-		if (!checkedHelpCommand && index == 1) {
-			keywordOptions.push_back(Keywords{keyword, "", true});
-			if (index < argc && !keyword.compare(argv[index])) {
-				index++;
-				helpMode = true;
-			}
-		}
-		checkedHelpCommand = true;
-		return helpMode;
-	}
-
-	template<typename T=std::string>
-	T flag(std::string key, std::string description, T defaultValue) {
-		consumeFlags();
-		if (!hasFlag(key, description)) return defaultValue;
-
-		auto iterator = flagMap.find(key);
-		return valueFromString<T>(iterator->second.c_str());
-	}
-	template<typename T=std::string>
-	T flag(std::string key, T defaultValue) {
-		consumeFlags();
-		if (!hasFlag(key, "")) return defaultValue;
-
-		auto iterator = flagMap.find(key);
-		return valueFromString<T>(iterator->second.c_str());
-	}
-	template<typename T=std::string>
-	T flag(std::string key) {
-		return flag<T>(key, T());
-	}
-	bool hasFlag(std::string key, std::string description="") {
-		consumeFlags();
-		auto iterator = flagSet.find(key);
-		if (iterator == flagSet.end()) {
-			flagSet.insert(key);
-			flagOptions.push_back(Keywords{key, description, false});
-		} else if (description.length() > 0) {
-			bool found = false;
-			for (auto &option : flagOptions) {
-				if (option.keyword == key) {
-					option.description = description;
-					found = true;
-					break;
-				}
-			}
-			if (!found) {
-				flagOptions.push_back(Keywords{key, description, false});
-			}
-		}
-
-		auto mapIterator = flagMap.find(key);
-		return mapIterator != flagMap.end();
-	}
-	bool helpFlag(std::string key, std::string description="shows this help") {
-		consumeFlags();
-		hasFlag(key, description);
-		auto iterator = flagMap.find(key);
-		helpMode = (iterator != flagMap.end());
-		return helpMode;
-	}
-};
-
-template<>
-std::string SimpleArgs::valueFromString(const char *arg) {
-	return arg;
-}
-template<>
-const char * SimpleArgs::valueFromString(const char *arg) {
-	return arg;
-}
-template<>
-int SimpleArgs::valueFromString(const char *arg) {
-	return std::stoi(arg);
-}
-template<>
-long SimpleArgs::valueFromString(const char *arg) {
-	return std::stol(arg);
-}
-template<>
-unsigned long SimpleArgs::valueFromString(const char *arg) {
-	return std::stoul(arg);
-}
-template<>
-float SimpleArgs::valueFromString(const char *arg) {
-	return std::stof(arg);
-}
-template<>
-double SimpleArgs::valueFromString(const char *arg) {
-	return std::stod(arg);
-}
--- a/cmd/util/stop-denormals.h
+++ b/cmd/util/stop-denormals.h
@ -1,34 +0,0 @@
-#pragma once
-
-#if defined(__SSE__) || defined(_M_X64)
-	class StopDenormals {
-		unsigned int controlStatusRegister;
-	public:
-		StopDenormals() : controlStatusRegister(_mm_getcsr()) {
-			_mm_setcsr(controlStatusRegister|0x8040); // Flush-to-Zero and Denormals-Are-Zero
-		}
-		~StopDenormals() {
-			_mm_setcsr(controlStatusRegister);
-		}
-	};
-#elif (defined (__ARM_NEON) || defined (__ARM_NEON__))
-	class StopDenormals {
-		uintptr_t status;
-	public:
-		StopDenormals() {
-			uintptr_t asmStatus;
-			asm volatile("mrs %0, fpcr" : "=r"(asmStatus));
-			status = asmStatus = asmStatus|0x01000000U; // Flush to Zero
-			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
-		}
-		~StopDenormals() {
-			uintptr_t asmStatus = status;
-			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
-		}
-	};
-#else
-#	if __cplusplus >= 202302L
-# 		warning "The `StopDenormals` class doesn't do anything for this architecture"
-#	endif
-	class StopDenormals {}; // FIXME: add for other architectures
-#endif
--- a/cmd/util/stopwatch.h
+++ b/cmd/util/stopwatch.h
@ -1,107 +0,0 @@
-#ifndef SIGNALSMITH_STOPWATCH_UTIL_H
-#define SIGNALSMITH_STOPWATCH_UTIL_H
-
-#include <limits>
-#include <cmath>
-#include <atomic>
-#include <algorithm>
-
-#ifdef WINDOWS // completely untested!
-#	include <windows.h>
-namespace signalsmith {
-class Stopwatch {
-	using Time = __int64;
-	using Duration = Time;
-	inline Time now() {
-		LARGE_INTEGER result;
-		QueryPerformanceCounter(&result);
-		return result.QuadPart;
-	}
-	static double toSeconds(Duration t) {
-		LARGE_INTEGER freq;
-		QueryPerformanceFrequency(&freq);
-		return t/double(freq);
-	}
-#else
-#	include <chrono>
-namespace signalsmith {
-class Stopwatch {
-	using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
-	using Time = Clock::time_point;
-	using Duration = std::chrono::duration<double>;
-	
-	inline Time now() {
-		return Clock::now();
-	}
-	static double toSeconds(Duration duration) {
-		return duration.count();
-	}
-#endif
-
-	std::atomic<Time> lapStart; // the atomic store/load should act as barriers for reordering operations
-	double lapBest, lapTotal, lapTotal2;
-	double lapOverhead = 0;
-	int lapCount = 0;
-	
-public:
-	Stopwatch(bool compensate=true) {
-		if (compensate) {
-			start();
-			const int repeats = 1000;
-			for (int i = 0; i < repeats; ++i) {
-				startLap();
-				lap();
-			}
-			lapOverhead = (double)lapTotal/lapCount;
-		}
-		start();
-	}
-	// Explicit because std::atomic<> can't be copied/moved
-	Stopwatch(const Stopwatch &other) : lapBest(other.lapBest), lapTotal(other.lapTotal), lapTotal2(other.lapTotal2), lapOverhead(other.lapOverhead), lapCount(other.lapCount) {
-		lapStart.store(other.lapStart.load());
-	}
-
-	void start() {
-		lapCount = 0;
-		lapTotal = lapTotal2 = 0;
-		lapBest = std::numeric_limits<double>::max();
-		startLap();
-	}
-	void startLap() {
-		lapStart.store(now());
-	}
-	double lap() {
-		double diff = toSeconds(now() - lapStart.load());
-
-		if (diff < lapBest) lapBest = diff;
-		lapCount++;
-		lapTotal += diff;
-		lapTotal2 += diff*diff;
-
-		startLap();
-		return diff;
-	}
-	double total() const {
-		return std::max(0.0, lapTotal - lapCount*lapOverhead);
-	}
-	double mean() const {
-		return total()/lapCount;
-	}
-	double var() const {
-		double m = (double)lapTotal/lapCount, m2 = (double)lapTotal2/lapCount;
-		return std::max(0.0, m2 - m*m);
-	}
-	double std() const {
-		return sqrt(var());
-	}
-	double best() const {
-		return std::max(0.0, lapBest - lapOverhead);
-	}
-	double optimistic(double deviations=1) const {
-		return std::max(best(), mean() - std()*deviations);
-	}
-};
-
-} //namespace
-
-#endif // include guard
--- a/cmd/util/wav.h
+++ b/cmd/util/wav.h
@ -1,255 +0,0 @@
-#ifndef RIFF_WAVE_H_
-#define RIFF_WAVE_H_
-
-#include <vector>
-#include <iostream>
-#include <fstream>
-
-// TODO: something better here that doesn't assume little-endian architecture
-template<bool littleEndian=true>
-struct BigEndian {
-	static uint32_t read16(std::istream& in) {
-		unsigned char a[2];
-		in.read((char*)a, sizeof(a));
-		return ((uint32_t)a[0]) + ((uint32_t)a[1])*256;
-	}
-	static uint32_t read32(std::istream& in) {
-		unsigned char a[4];
-		in.read((char*)a, sizeof(a));
-		return ((uint32_t)a[0]&0xff) + ((uint32_t)a[1])*256 + ((uint32_t)a[2])*65536 + ((uint32_t)a[3])*256*65536;
-	}
-	
-	static void write16(std::ostream& out, uint16_t value) {
-		char a[2] = {(char)(value>>0), (char)(value>>8)};
-		out.write(a, sizeof(a));
-	}
-	static void write32(std::ostream& out, uint32_t value) {
-		char a[4] = {(char)(value>>0), (char)(value>>8), (char)(value>>16), (char)(value>>24)};
-		out.write(a, sizeof(a));
-	}
-};
-
-class Wav : BigEndian<true> {
-	// Little-endian versions of text values
-	uint32_t value_RIFF = 0x46464952;
-	uint32_t value_WAVE = 0x45564157;
-	uint32_t value_fmt = 0x20746d66;
-	uint32_t value_data = 0x61746164;
-
-	using BigEndian<true>::read16;
-	using BigEndian<true>::read32;
-	using BigEndian<true>::write16;
-	using BigEndian<true>::write32;
-
-public:
-	struct Result {
-		enum class Code {
-			OK = 0,
-			IO_ERROR,
-			FORMAT_ERROR,
-			UNSUPPORTED,
-			WEIRD_CONFIG
-		};
-		Code code = Code::OK;
-		std::string reason;
-		
-		Result(Code code, std::string reason="") : code(code), reason(reason) {};
-		// Used to neatly test for success
-		explicit operator bool () const {
-			return code == Code::OK;
-		};
-		const Result & warn(std::ostream& output=std::cerr) const {
-			if (!(bool)*this) {
-				output << "WAV error: " << reason << std::endl;
-			}
-			return *this;
-		}
-	};
-	
-	size_t sampleRate = 48000;
-	size_t channels = 1, offset = 0;
-	std::vector<double> samples;
-	size_t length() const {
-		size_t perChannel = samples.size()/channels;
-		return (perChannel >= offset) ? perChannel - offset : 0;
-	}
-	void resize(size_t length) {
-		samples.resize((offset + length)*channels, 0);
-	}
-	template<bool isConst>
-	class ChannelReader {
-		using CSample = typename std::conditional<isConst, const double, double>::type;
-		CSample *data;
-		size_t stride;
-	public:
-		ChannelReader(CSample *samples, size_t channels) : data(samples), stride(channels) {}
-		
-		CSample & operator [](size_t i) {
-			return data[i*stride];
-		}
-	};
-	ChannelReader<false> operator [](size_t c) {
-		return ChannelReader<false>(samples.data() + offset*channels + c, channels);
-	}
-	ChannelReader<true> operator [](size_t c) const {
-		return ChannelReader<true>(samples.data() + offset*channels + c, channels);
-	}
-	
-	Result result = Result(Result::Code::OK);
-
-	Wav() {}
-	Wav(double sampleRate, size_t channels) : sampleRate(sampleRate), channels(channels) {}
-	Wav(double sampleRate, size_t channels, const std::vector<double> &samples) : sampleRate(sampleRate), channels(channels), samples(samples) {}
-	Wav(std::string filename) {
-		result = read(filename).warn();
-	}
-	
-	enum class Format {
-		PCM=1
-	};
-	bool formatIsValid(uint16_t format, uint16_t bits) const {
-		if (format == (uint16_t)Format::PCM) {
-			if (bits == 16) {
-				return true;
-			}
-		}
-		return false;
-	}
-	
-	Result read(std::string filename) {
-		std::ifstream file;
-		file.open(filename, std::ios::binary);
-		if (!file.is_open()) return result = Result(Result::Code::IO_ERROR, "Failed to open file: " + filename);
-
-		// RIFF chunk
-		if (read32(file) != value_RIFF) return result = Result(Result::Code::FORMAT_ERROR, "Input is not a RIFF file");
-		read32(file); // File length - we don't check this
-		if (read32(file) != value_WAVE) return result = Result(Result::Code::FORMAT_ERROR, "Input is not a plain WAVE file");
-		
-		auto blockStart = file.tellg(); // start of the blocks - we will seek back to here periodically
-		bool hasFormat = false, hasData = false;
-		
-		Format format = Format::PCM; // Shouldn't matter, we should always read the `fmt ` chunk before `data`
-		while (!file.eof()) {
-			auto blockType = read32(file), blockLength = read32(file);
-			if (file.eof()) break;
-			if (!hasFormat && blockType == value_fmt) {
-				auto formatInt = read16(file);
-				format = (Format)formatInt;
-				channels = read16(file);
-				if (channels < 1) return result = Result(Result::Code::FORMAT_ERROR, "Cannot have zero channels");
-				
-				sampleRate = read32(file);
-				if (sampleRate < 1) return result = Result(Result::Code::FORMAT_ERROR, "Cannot have zero sampleRate");
-
-				size_t expectedBytesPerSecond = read32(file);
-				size_t bytesPerFrame = read16(file);
-				size_t bitsPerSample = read16(file);
-				if (!formatIsValid(formatInt, bitsPerSample)) return result = Result(Result::Code::UNSUPPORTED, "Unsupported format:bits: " + std::to_string(formatInt) + ":" + std::to_string(bitsPerSample));
-				// Since it's plain WAVE, we can do some extra checks for consistency
-				if (bitsPerSample*channels != bytesPerFrame*8) return result = Result(Result::Code::FORMAT_ERROR, "Format sizes don't add up");
-				if (expectedBytesPerSecond != sampleRate*bytesPerFrame) return result = Result(Result::Code::FORMAT_ERROR, "Format sizes don't add up");
-
-				hasFormat = true;
-				file.clear();
-				file.seekg(blockStart);
-			} else if (hasFormat && blockType == value_data) {
-				std::vector<double> samples(0);
-				switch (format) {
-				case Format::PCM:
-					samples.reserve(blockLength/2);
-					for (size_t i = 0; i < blockLength/2; ++i) {
-						uint16_t value = read16(file);
-						if (file.eof()) break;
-						if (value >= 32768) {
-							samples.push_back(((double)value - 65536)/32768);
-						} else {
-							samples.push_back((double)value/32768);
-						}
-					}
-				}
-				while (samples.size()%channels != 0) {
-					samples.push_back(0);
-				}
-				this->samples = samples;
-				offset = 0;
-				hasData = true;
-			} else {
-				// We either don't recognise
-				file.ignore(blockLength);
-			}
-		}
-		if (!hasFormat) return result = Result(Result::Code::FORMAT_ERROR, "missing `fmt ` block");
-		if (!hasData) return result = Result(Result::Code::FORMAT_ERROR, "missing `data` block");
-		return result = Result(Result::Code::OK);
-	}
-	
-	Result write(std::string filename, Format format=Format::PCM) {
-		if (channels == 0 || channels > 65535) return result = Result(Result::Code::WEIRD_CONFIG, "Invalid channel count");
-		if (sampleRate <= 0 || sampleRate > 0xFFFFFFFFu) return result = Result(Result::Code::WEIRD_CONFIG, "Invalid sample rate");
-		
-		std::ofstream file;
-		file.open(filename, std::ios::binary);
-		if (!file.is_open()) return result = Result(Result::Code::IO_ERROR, "Failed to open file: " + filename);
-		
-		size_t bytesPerSample;
-		switch (format) {
-		case Format::PCM:
-			bytesPerSample = 2;
-			break;
-		}
-		
-		// File size - 44 bytes is RIFF header, "fmt" block, and "data" block header
-		size_t dataLength = (samples.size() - offset*channels)*bytesPerSample;
-		size_t fileLength = 44 + dataLength;
-
-		// RIFF chunk
-		write32(file, value_RIFF);
-		write32(file, uint32_t(fileLength - 8)); // File length, excluding the RIFF header
-		write32(file, value_WAVE);
-		// "fmt " block
-		write32(file, value_fmt);
-		write32(file, 16); // block length
-		write16(file, uint16_t(format));
-		write16(file, uint16_t(channels));
-		write32(file, uint32_t(sampleRate));
-		size_t expectedBytesPerSecond = sampleRate*channels*bytesPerSample;
-		write32(file, uint32_t(expectedBytesPerSecond));
-		write16(file, uint16_t(channels*bytesPerSample)); // Bytes per frame
-		write16(file, uint16_t(bytesPerSample*8)); // bist per sample
-		
-		// "data" block
-		write32(file, value_data);
-		write32(file, uint32_t(dataLength));
-		switch (format) {
-		case Format::PCM:
-			for (size_t i = offset*channels; i < samples.size(); i++) {
-				double value = samples[i]*32768;
-				if (value > 32767) value = 32767;
-				if (value <= -32768) value = -32768;
-				if (value < 0) value += 65536;
-				write16(file, (uint16_t)value);
-			}
-			break;
-		}
-		return result = Result(Result::Code::OK);
-	}
-	
-	void makeMono() {
-		std::vector<double> newSamples(samples.size()/channels, 0);
-		
-		for (size_t channel = 0; channel < channels; ++channel) {
-			for (size_t i = 0; i < newSamples.size(); ++i) {
-				newSamples[i] += samples[i*channels + channel];
-			}
-		}
-		for (size_t i = 0; i < newSamples.size(); ++i) {
-			newSamples[i] /= channels;
-		}
-		
-		channels = 1;
-		samples = newSamples;
-	}
-};
-
-#endif // RIFF_WAVE_H_
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@ -33,17 +33,12 @@ namespace _impl {

 template<typename Sample=float, class RandomEngine=void>
 struct SignalsmithStretch {
-	static constexpr size_t version[3] = {1, 3, 1};
+	static constexpr size_t version[3] = {1, 3, 2};

 	SignalsmithStretch() : randomEngine(std::random_device{}()) {}
 	SignalsmithStretch(long seed) : randomEngine(seed) {}
 		
-	int blockSamples() const {
-		return int(stft.blockSamples());
-	}
-	int intervalSamples() const {
-		return int(stft.defaultInterval());
-	}
+	// The difference between the internal position (centre of a block) and the input samples you're supplying
 	int inputLatency() const {
 		return int(stft.analysisLatency());
 	}
@ -81,7 +76,6 @@ struct SignalsmithStretch {
 		stft.reset(0.1);
 		stashedInput = stft.input;
 		stashedOutput = stft.output;
-		tmpBuffer.resize(blockSamples + intervalSamples);

 		bands = int(stft.bands());
 		channelBands.assign(bands*channels, Band());
@ -94,6 +88,19 @@ struct SignalsmithStretch {

 		blockProcess = {};
 		formantMetric.resize(bands + 2);
+
+		tmpProcessBuffer.resize(blockSamples + intervalSamples);
+		tmpPreRollBuffer.resize(outputLatency()*channels);
+	}
+	// For querying the existing config
+	int blockSamples() const {
+		return int(stft.blockSamples());
+	}
+	int intervalSamples() const {
+		return int(stft.defaultInterval());
+	}
+	bool splitComputation() const {
+		return _splitComputation;
 	}

 	/// Frequency multiplier, and optional tonality limit (as multiple of sample-rate)
@ -127,14 +134,15 @@ struct SignalsmithStretch {
 		formantBaseFreq = baseFreq;
 	}
 	
-	// Provide previous input ("pre-roll"), without affecting the speed calculation.  You should ideally feed it one block-length + one interval
+	// Provide previous input ("pre-roll") to smoothly change the input location without interrupting the output.  This doesn't do any calculation, just copies input to a buffer.
+	// You should ideally feed it `seekLength()` frames of input, unless it's directly after a `.reset()` (in which case `.outputSeek()` might be a better choice)
 	template<class Inputs>
 	void seek(Inputs &&inputs, int inputSamples, double playbackRate) {
-		tmpBuffer.resize(0);
-		tmpBuffer.resize(stft.blockSamples() + stft.defaultInterval());
+		tmpProcessBuffer.resize(0);
+		tmpProcessBuffer.resize(stft.blockSamples() + stft.defaultInterval());

-		int startIndex = std::max<int>(0, inputSamples - int(tmpBuffer.size())); // start position in input
-		int padStart = int(tmpBuffer.size() + startIndex) - inputSamples; // start position in tmpBuffer
+		int startIndex = std::max<int>(0, inputSamples - int(tmpProcessBuffer.size())); // start position in input
+		int padStart = int(tmpProcessBuffer.size() + startIndex) - inputSamples; // start position in tmpProcessBuffer

 		Sample totalEnergy = 0;
 		for (int c = 0; c < channels; ++c) {
@ -142,12 +150,12 @@ struct SignalsmithStretch {
 			for (int i = startIndex; i < inputSamples; ++i) {
 				Sample s = inputChannel[i];
 				totalEnergy += s*s;
-				tmpBuffer[i - startIndex + padStart] = s;
+				tmpProcessBuffer[i - startIndex + padStart] = s;
 			}
 			
-			stft.writeInput(c, tmpBuffer.size(), tmpBuffer.data());
+			stft.writeInput(c, tmpProcessBuffer.size(), tmpProcessBuffer.data());
 		}
-		stft.moveInput(tmpBuffer.size());
+		stft.moveInput(tmpProcessBuffer.size());
 		if (totalEnergy >= noiseFloor) {
 			silenceCounter = 0;
 			silenceFirst = true;
@ -155,6 +163,48 @@ struct SignalsmithStretch {
 		didSeek = true;
 		seekTimeFactor = (playbackRate*stft.defaultInterval() > 1) ? 1/playbackRate : stft.defaultInterval();
 	}
+	int seekLength() const {
+		return int(stft.blockSamples() + stft.defaultInterval());
+	}
+	
+	// Moves the input position *and* pre-calculates some output, so that the next samples returned from `.process()` are aligned to the beginning of the sample.
+	// The time-stretch rate is inferred from `inputLength`, so use `.outputSeekLength()` to get a correct value for that.
+	template<class Inputs>
+	void outputSeek(Inputs &&inputs, int inputLength) {
+		// TODO: add fade-out parameter to avoid clicks, instead of doing a full reset
+		reset();
+		// Assume we've been handed enough surplus input to produce `outputLatency()` samples of pre-roll
+		int surplusInput = std::max<int>(inputLength - inputLatency(), 0);
+		Sample playbackRate = surplusInput/Sample(outputLatency());
+
+		// Move the input position to the start of the sound
+		int seekSamples = inputLength - surplusInput;
+		seek(inputs, seekSamples, playbackRate);
+		
+		tmpPreRollBuffer.resize(outputLatency()*channels);
+		struct BufferOutput {
+			Sample *samples;
+			int length;
+			
+			Sample * operator[](int c) {
+				return samples + c*length;
+			}
+		} preRollOutput{tmpPreRollBuffer.data(), outputLatency()};
+		
+		// Use the surplus input to produce pre-roll output
+		OffsetIO<Inputs> offsetInput{inputs, seekSamples};
+		process(offsetInput, surplusInput, preRollOutput, preRollOutput.length);
+		
+		// put the thing down, flip it and reverse it
+		for (auto &v : tmpPreRollBuffer) v = -v;
+		for (int c = 0; c < channels; ++c) {
+			std::reverse(preRollOutput[c], preRollOutput[c] + preRollOutput.length);
+			stft.addOutput(c, preRollOutput.length, preRollOutput[c]);
+		}
+	}
+	int outputSeekLength(Sample playbackRate) const {
+		return inputLatency() + playbackRate*outputLatency();
+	}

 	template<class Inputs, class Outputs>
 	void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
@ -165,14 +215,14 @@ struct SignalsmithStretch {
 		auto copyInput = [&](int toIndex){

 			int length = std::min<int>(int(stft.blockSamples() + stft.defaultInterval()), toIndex - prevCopiedInput);
-			tmpBuffer.resize(length);
+			tmpProcessBuffer.resize(length);
 			int offset = toIndex - length;
 			for (int c = 0; c < channels; ++c) {
 				auto &&inputBuffer = inputs[c];
 				for (int i = 0; i < length; ++i) {
-					tmpBuffer[i] = inputBuffer[i + offset];
+					tmpProcessBuffer[i] = inputBuffer[i + offset];
 				}
-				stft.writeInput(c, length, tmpBuffer.data());
+				stft.writeInput(c, length, tmpProcessBuffer.data());
 			}
 			stft.moveInput(length);
 			prevCopiedInput = toIndex;
@ -372,28 +422,38 @@ struct SignalsmithStretch {
 #endif
 	}

-	// Read the remaining output, providing no further input.  `outputSamples` should ideally be at least `.outputLatency()`
+	// Read the remaining output, providing no further input.  If `outputSamples` is more than one interval, it will compute additional blocks assuming a zero-valued input
 	template<class Outputs>
-	void flush(Outputs &&outputs, int outputSamples) {
-		int plainOutput = std::min<int>(outputSamples, int(stft.blockSamples()));
-		int foldedBackOutput = std::min<int>(outputSamples, int(stft.blockSamples()) - plainOutput);
+	void flush(Outputs &&outputs, int outputSamples, Sample playbackRate=0) {
+		struct Zeros {
+			struct Channel {
+				Sample operator[](int) {
+					return 0;
+				}
+			};
+			Channel operator[](int) {
+				return {};
+			}
+		} zeros;
+		// If we're asked for more than an interval of extra output, then zero-pad the input
+		int outputBlock = std::max<int>(0, outputSamples - stft.defaultInterval());
+		if (outputBlock > 0) process(zeros, outputBlock*playbackRate, outputs, outputBlock);
+
+		int tailSamples = outputSamples - outputBlock; // at most one interval
+		tmpProcessBuffer.resize(tailSamples);
 		stft.finishOutput(1);
 		for (int c = 0; c < channels; ++c) {
-			tmpBuffer.resize(plainOutput);
-			stft.readOutput(c, plainOutput, tmpBuffer.data());
+			stft.readOutput(c, tailSamples, tmpProcessBuffer.data());
 			auto &&outputChannel = outputs[c];
-			for (int i = 0; i < plainOutput; ++i) {
-				// TODO: plain output should be gain-
-				outputChannel[i] = tmpBuffer[i];
+			for (int i = 0; i < tailSamples; ++i) {
+				outputChannel[outputBlock + i] = tmpProcessBuffer[i];
 			}
-			tmpBuffer.resize(foldedBackOutput);
-			stft.readOutput(c, plainOutput, foldedBackOutput, tmpBuffer.data());
-			for (int i = 0; i < foldedBackOutput; ++i) {
-				outputChannel[outputSamples - 1 - i] -= tmpBuffer[i];
+			stft.readOutput(c, tailSamples, tailSamples, tmpProcessBuffer.data());
+			for (int i = 0; i < tailSamples; ++i) {
+				outputChannel[outputBlock + tailSamples - 1 - i] -= tmpProcessBuffer[i];
 			}
 		}
-		stft.reset(0.1);
-
+		stft.reset(0.1f);
 		// Reset the phase-vocoder stuff, so the next block gets a fresh start
 		for (int c = 0; c < channels; ++c) {
 			auto channelBands = bandsForChannel(c);
@ -403,65 +463,30 @@ struct SignalsmithStretch {
 		}
 	}

+	// Process a complete audio buffer all in one go
 	template<class Inputs, class Outputs>
 	bool exact(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
-		if (outputSamples < outputLatency()*2) return false; // too short for this
-
-		struct ZeroPaddedInput {
-			Inputs &inputs;
-			int offset, length;
-			
-			struct Channel {
-				ZeroPaddedInput &zpi;
-				int channel;
-				
-				Sample operator[](int i) {
-					if (zpi.offset + i < zpi.length) return zpi.inputs[channel][zpi.offset + i];
-					return 0;
+		Sample playbackRate = inputSamples/Sample(outputSamples);
+		auto seekLength = outputSeekLength(playbackRate);
+		if (inputSamples < seekLength) {
+			// too short for this - zero the output just to be polite
+			for (int c = 0; c < channels; ++c) {
+				auto &&channel = outputs[c];
+				for (int i = 0; i < outputSamples; ++i) {
+					channel[i] = 0;
 				}
-			};
-			
-			Channel operator[](int c){
-				return {*this, c};
-			}
-		} zpi{inputs, inputLatency(), inputSamples};
-		seek(inputs, inputLatency(), Sample(inputSamples)/outputSamples); // start positioned on the centre of the input
-		process(zpi, inputSamples, outputs, outputSamples);
-				
-		// Fold the first bit of the input back onto itself
-		for (int c = 0; c < channels; ++c) {
-			auto &&channel = outputs[c];
-			for (int i = 0; i < std::min<int>(outputSamples - outputLatency(), outputLatency()); ++i) {
-				channel[i + outputLatency()] -= channel[outputLatency() - 1 - i];
-			}
-		}
-		// Shuffle everything along to compensate for output latency
-		for (int c = 0; c < channels; ++c) {
-			auto &&channel = outputs[c];
-			for (int i = 0; i < outputSamples - outputLatency(); ++i) {
-				channel[i] = channel[i + outputLatency()];
 			}
+			return false;
 		}

-		struct OffsetOutput {
-			Outputs &outputs;
-			int offset;
+		outputSeek(inputs, seekLength);

-			struct Channel {
-				OffsetOutput &oo;
-				int channel;
+		int outputIndex = outputSamples - seekLength/playbackRate;
+		OffsetIO<Inputs> offsetInput{inputs, seekLength};
+		process(offsetInput, inputSamples - seekLength, outputs, outputIndex);
 		
-				decltype(outputs[0][0]) operator[](int i) {
-					return oo.outputs[channel][oo.offset + i];
-				}
-			};
-			
-			Channel operator[](int c){
-				return {*this, c};
-			}
-		} oo{outputs, outputSamples - outputLatency()};
-		// Get the final chunk - extra output is already folded back as part of this
-		flush(oo, outputLatency());
+		OffsetIO<Outputs> offsetOutput{outputs, outputIndex};
+		flush(offsetOutput, outputSamples - outputIndex, playbackRate);
 		return true;
 	}

@ -496,7 +521,7 @@ private:
 	typename STFT::Input stashedInput;
 	typename STFT::Output stashedOutput;
 	
-	std::vector<Sample> tmpBuffer;
+	std::vector<Sample> tmpProcessBuffer, tmpPreRollBuffer;

 	int channels = 0, bands = 0;
 	int prevInputOffset = -1;
@ -956,20 +981,27 @@ private:

 			freqEstimate = freqToBand(formantBaseFreq);
 			if (formantBaseFreq <= 0) freqEstimate = estimateFrequency();
-
-			for (int b = 0; b < bands; ++b) {
-				formantMetric[b] = std::sqrt(formantMetric[b]);
-			}
 		} else if (step-- == 0) {
-			Sample slew = 1/(freqEstimate*0.5 + 1);
+			Sample decay = 1 - 1/(freqEstimate*0.5 + 1);
 			Sample e = 0;
 			for (size_t repeat = 0; repeat < 2; ++repeat) {
 				for (int b = bands - 1; b >= 0; --b) {
-					e += (formantMetric[b] - e)*slew;
+					e = std::max(formantMetric[b], e*decay);
 					formantMetric[b] = e;
 				}
 				for (int b = 0; b < bands; ++b) {
-					e += (formantMetric[b] - e)*slew;
+					e = std::max(formantMetric[b], e*decay);
+					formantMetric[b] = e;
+				}
+			}
+			decay = 1/decay;
+			for (size_t repeat = 0; repeat < 2; ++repeat) {
+				for (int b = bands - 1; b >= 0; --b) {
+					e = std::min(formantMetric[b], e*decay);
+					formantMetric[b] = e;
+				}
+				for (int b = 0; b < bands; ++b) {
+					e = std::min(formantMetric[b], e*decay);
 					formantMetric[b] = e;
 				}
 			}
@ -992,7 +1024,7 @@ private:
 				Sample targetE = getFormant(freqToBand(outputF));

 				Sample formantRatio = targetE/(inputE + Sample(1e-30));
-				Sample energyRatio = formantRatio*formantRatio;
+				Sample energyRatio = formantRatio;

 				for (int c = 0; c < channels; ++c) {
 					Band *bins = bandsForChannel(c);
@ -1002,6 +1034,26 @@ private:
 			}
 		}
 	}
+
+	// Proxy class to avoid copying/allocating anything: wraps a multi-channel `Io` by reference and reads/writes `io[channel][i + offset]`
+	template<class Io>
+	struct OffsetIO {
+		Io &io; // wrapped input/output object, held by reference (not copied)
+		int offset; // added to every sample index before forwarding
+
+		struct Channel {
+			Io &io; // same wrapped object as the parent proxy
+			int channel; // channel index this view forwards to
+			int offset; // copy of the parent's sample offset
+			
+			auto operator[](int i) -> decltype(io[0][0]) { // returns whatever `io[channel][i + offset]` yields
+				return io[channel][i + offset];
+			}
+		};
+		Channel operator[](int c) { // builds a per-channel view carrying the same reference and offset
+			return {io, c, offset};
+		}
+	};
 };

 }} // namespace
--- a/web/release/SignalsmithStretch.js
+++ b/web/release/SignalsmithStretch.js
@ -91,15 +91,11 @@ function registerWorkletProcessor(Module, audioNodeKey) {
 						latestSegment = this.timeMap.pop();
 					}

-					let obj = {
-						active: latestSegment.active,
+					let obj = Object.assign({}, latestSegment);
+					Object.assign(obj, {
 						input: null,
 						output: outputTime,
-						rate: latestSegment.rate,
-						semitones: latestSegment.semitones,
-						loopStart: latestSegment.loopStart,
-						loopEnd: latestSegment.loopEnd
-					};
+					});
 					Object.assign(obj, objIn);
 					if (obj.input === null) {
 						let rate = (latestSegment.active ? latestSegment.rate : 0);
--- a/web/release/SignalsmithStretch.mjs
+++ b/web/release/SignalsmithStretch.mjs
@ -92,15 +92,11 @@ function registerWorkletProcessor(Module, audioNodeKey) {
 						latestSegment = this.timeMap.pop();
 					}

-					let obj = {
-						active: latestSegment.active,
+					let obj = Object.assign({}, latestSegment);
+					Object.assign(obj, {
 						input: null,
 						output: outputTime,
-						rate: latestSegment.rate,
-						semitones: latestSegment.semitones,
-						loopStart: latestSegment.loopStart,
-						loopEnd: latestSegment.loopEnd
-					};
+					});
 					Object.assign(obj, objIn);
 					if (obj.input === null) {
 						let rate = (latestSegment.active ? latestSegment.rate : 0);
--- a/web/release/package.json
+++ b/web/release/package.json
@ -1,6 +1,6 @@
 {
 	"name": "signalsmith-stretch",
-	"version": "1.3.1",
+	"version": "1.3.2",
 	"description": "JS/WASM release of the Signalsmith Stretch library",
 	"main": "SignalsmithStretch.mjs",
 	"exports": {
--- a/web/web-wrapper.js
+++ b/web/web-wrapper.js
@ -72,15 +72,11 @@ function registerWorkletProcessor(Module, audioNodeKey) {
 						latestSegment = this.timeMap.pop();
 					}

-					let obj = {
-						active: latestSegment.active,
+					let obj = Object.assign({}, latestSegment);
+					Object.assign(obj, {
 						input: null,
 						output: outputTime,
-						rate: latestSegment.rate,
-						semitones: latestSegment.semitones,
-						loopStart: latestSegment.loopStart,
-						loopEnd: latestSegment.loopEnd
-					};
+					});
 					Object.assign(obj, objIn);
 					if (obj.input === null) {
 						let rate = (latestSegment.active ? latestSegment.rate : 0);
Author	SHA1	Message	Date
Geraint Luff	90d6c686eb	Add reflected pre-roll to `.outputSeek()`	2025-08-11 16:37:44 +01:00
Geraint Luff	2724daacaf	`.flush()` processes new output (zero-valued input) for longer lengths	2025-08-11 14:54:32 +01:00
Geraint	3e71aec5f7	Use `.outputSeek()` for `.exact()`	2025-08-10 21:15:04 +01:00
Geraint	c3fcda8563	Add `.outputSeek()` for playing back samples	2025-08-10 20:15:45 +01:00
Geraint	12de19e05d	Formant metric based on peaks/dips	2025-08-10 20:15:45 +01:00
Geraint Luff	53159860b1	Start `.outputSeek()` method	2025-08-10 20:15:23 +01:00
Geraint Luff	72a4c5e5bb	Start simpler command-line example	2025-08-10 11:51:40 +01:00
Geraint Luff	2312b26341	Update `signalsmith-linear` to 0.2.2 for PFFFT support	2025-08-09 21:29:21 +01:00
Geraint Luff	cb5a8eab7e	Zero `.exact()` output when it's too short	2025-08-05 08:00:13 +01:00
Geraint	222093b4cc	Bugfix in web release	2025-06-27 02:09:03 +01:00
				`@ -0,0 +1 @@`
				`Subproject commit aeb4e31077a453566b58fff1c7e7e998ac824157`