diff --git a/cmd/Makefile b/cmd/Makefile
index 6887a65..30fdfb2 100644
--- a/cmd/Makefile
+++ b/cmd/Makefile
@@ -11,19 +11,19 @@ out/stretch: main.cpp ../signalsmith-stretch.h util/*.h util/*.hxx
 # Uses input files from: https://signalsmith-audio.co.uk/code/stretch/inputs.zip
 examples: out/stretch
 	mkdir -p out/examples
-	inputs/run-all.sh out/examples/u2- out/stretch --semitones=2
-	inputs/run-all.sh out/examples/d2- out/stretch --semitones=-2
-	inputs/run-all.sh out/examples/u4- out/stretch --semitones=4
-	inputs/run-all.sh out/examples/d4- out/stretch --semitones=-4
-	inputs/run-all.sh out/examples/u8- out/stretch --semitones=8
-	inputs/run-all.sh out/examples/d8- out/stretch --semitones=-8
-	inputs/run-all.sh out/examples/u16- out/stretch --semitones=16
-	inputs/run-all.sh out/examples/d16- out/stretch --semitones=-16
-	inputs/run-all.sh out/examples/t_8- out/stretch --time=0.8
-	inputs/run-all.sh out/examples/t1_2- out/stretch --time=1.2
-	inputs/run-all.sh out/examples/t1_5- out/stretch --time=1.5
-	inputs/run-all.sh out/examples/t2- out/stretch --time=2
-	inputs/run-all.sh out/examples/t4- out/stretch --time=4
+	inputs/run-all.sh out/examples/u2- out/stretch --semitones=2 --exact
+	inputs/run-all.sh out/examples/d2- out/stretch --semitones=-2 --exact
+	inputs/run-all.sh out/examples/u4- out/stretch --semitones=4 --exact
+	inputs/run-all.sh out/examples/d4- out/stretch --semitones=-4 --exact
+	inputs/run-all.sh out/examples/u8- out/stretch --semitones=8 --exact
+	inputs/run-all.sh out/examples/d8- out/stretch --semitones=-8 --exact
+	inputs/run-all.sh out/examples/u16- out/stretch --semitones=16 --exact
+	inputs/run-all.sh out/examples/d16- out/stretch --semitones=-16 --exact
+	inputs/run-all.sh out/examples/t_8- out/stretch --time=0.8 --exact
+	inputs/run-all.sh out/examples/t1_2- out/stretch --time=1.2 --exact
+	inputs/run-all.sh out/examples/t1_5- out/stretch --time=1.5 --exact
+	inputs/run-all.sh out/examples/t2- out/stretch --time=2 --exact
+	inputs/run-all.sh out/examples/t4- out/stretch --time=4 --exact
 
 TEST_WAV ?= "inputs/voice.wav"
 
diff --git a/cmd/main.cpp b/cmd/main.cpp
index 8cdec21..af3435b 100644
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@@ -123,26 +123,34 @@ int main(int argc, char* argv[]) {
 		std::cout << "\tallocated " << (initMemory.allocBytes/1000) << "kB, freed " << (initMemory.freeBytes/1000) << "kB\n";
 	}
 
-	// pad the input at the end, since we'll be reading slightly ahead
-	size_t paddedInputLength = inputLength + stretch.inputLatency();
-	inWav.samples.resize(paddedInputLength*inWav.channels);
-	// pad the output at the end, since we have output latency as well
-	int tailSamples = exactLength ? stretch.outputLatency() : (stretch.outputLatency() + stretch.inputLatency()); // if we don't need exact length, add a bit more output to catch any wobbles past the end
-	int paddedOutputLength = outputLength + tailSamples;
-	outWav.samples.resize(paddedOutputLength*outWav.channels);
-
 	signalsmith::MemoryTracker processMemory;
 
-	stopwatch.start();
-	// The simplest way to deal with input latency (when have access to the audio buffer) is to always be slightly ahead in the input
-	stretch.seek(inWav, stretch.inputLatency(), 1/time);
-	inWav.offset += stretch.inputLatency();
-	// Process it all in one call, although it works just the same if we split into smaller blocks
-	stretch.process(inWav, int(inputLength), outWav, int(outputLength));
-	// Read the last bit of output without giving it any more input
-	outWav.offset += outputLength;
-	stretch.flush(outWav, tailSamples);
-	outWav.offset -= outputLength;
+	if (exactLength) {
+		outWav.samples.resize(outputLength*outWav.channels);
+		stopwatch.start();
+		processMemory = {};
+		if (!stretch.exact(inWav, int(inputLength), outWav, int(outputLength))) args.errorExit("audio too short for exact-length processing"); // `exact()` returns false without producing any output, so don't carry on silently
+	} else {
+		// pad the input at the end, since we'll be reading slightly ahead
+		size_t paddedInputLength = inputLength + stretch.inputLatency();
+		inWav.samples.resize(paddedInputLength*inWav.channels);
+		// pad the output at the end, since we have output latency as well
+		int tailSamples = stretch.outputLatency() + stretch.inputLatency(); // exact length isn't needed on this path, so add a bit more output to catch any wobbles past the end
+		int paddedOutputLength = outputLength + tailSamples;
+		outWav.samples.resize(paddedOutputLength*outWav.channels);
+
+		stopwatch.start();
+		// The simplest way to deal with input latency (when we have access to the audio buffer) is to always be slightly ahead in the input
+		stretch.seek(inWav, stretch.inputLatency(), 1/time);
+		inWav.offset += stretch.inputLatency();
+		// Process it all in one call, although it works just the same if we split into smaller blocks
+		processMemory = {};
+		stretch.process(inWav, int(inputLength), outWav, int(outputLength));
+		// Read the last bit of output without giving it any more input
+		outWav.offset += outputLength;
+		stretch.flush(outWav, tailSamples);
+		outWav.offset -= outputLength;
+	}
 
 	double processSeconds = stopwatch.lap();
 	double processRate = (inWav.length()/inWav.sampleRate)/processSeconds;
@@ -154,20 +162,6 @@ int main(int argc, char* argv[]) {
 		if (processMemory) args.errorExit("allocated during process()");
 	}
 	
-	if (exactLength) {
-		// The start has some extra output - we could just trim it, but we might as well fold it back into the output
-		for (size_t c = 0; c < outWav.channels; ++c) {
-			for (int i = 0; i < stretch.outputLatency(); ++i) {
-				double trimmed = outWav[stretch.outputLatency() - 1 - i][c];
-				outWav[stretch.outputLatency() + i][c] -= trimmed; // reversed in time and negated
-			}
-		}
-		// Skips the output
-		outWav.offset += stretch.outputLatency();
-
-		// the `.flush()` call already handled foldback stuff at the end (since we asked for a shorter `tailSamples`)
-	}
-
 #ifdef PROFILE_PLOT_CHUNKS
 	signalsmith::plot::Figure figure;
 	auto &plot = figure(0, 0).plot(400, 150);
diff --git a/signalsmith-stretch.h b/signalsmith-stretch.h
index 024dcbb..6a0a869 100644
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@@ -402,6 +402,80 @@ struct SignalsmithStretch {
 			}
 		}
 	}
+
+	// Stretches a whole buffer in one call, writing exactly `outputSamples` samples with the output latency already removed.
+	// `inputs` and `outputs` are indexed as `[channel][sample]`; during processing the input reads as silence beyond `inputSamples`.
+	// Returns `false` (leaving `outputs` untouched) if either buffer is too short: the output must hold at least two output-latencies, and the input at least one input-latency.
+	template<class Inputs, class Outputs>
+	bool exact(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
+		const int inLatency = inputLatency();
+		const int outLatency = outputLatency();
+		if (outputSamples < outLatency*2) return false; // too short for the fold-back below
+		// `seek()` is handed the first `inLatency` samples straight from `inputs` (no zero-padding), so they must actually exist
+		if (inputSamples < inLatency) return false;
+
+		// Input view which starts `offset` samples in, and reads as zero past the real end
+		struct ZeroPaddedInput {
+			Inputs &inputs;
+			int offset, length;
+
+			struct Channel {
+				ZeroPaddedInput &zpi;
+				int channel;
+
+				Sample operator[](int i) {
+					if (zpi.offset + i < zpi.length) return zpi.inputs[channel][zpi.offset + i];
+					return 0;
+				}
+			};
+
+			Channel operator[](int c){
+				return {*this, c};
+			}
+		} zpi{inputs, inLatency, inputSamples};
+		seek(inputs, inLatency, Sample(inputSamples)/outputSamples); // start positioned on the centre of the input
+		process(zpi, inputSamples, outputs, outputSamples);
+
+		// Fold the first bit of the output back onto itself (reversed in time and negated)
+		for (int c = 0; c < channels; ++c) {
+			auto &&channel = outputs[c];
+			// the length check above guarantees `outLatency*2 <= outputSamples`, so this stays in range
+			for (int i = 0; i < outLatency; ++i) {
+				channel[i + outLatency] -= channel[outLatency - 1 - i];
+			}
+		}
+		// Shuffle everything along to compensate for output latency
+		const int shiftedLength = outputSamples - outLatency;
+		for (int c = 0; c < channels; ++c) {
+			auto &&channel = outputs[c];
+			for (int i = 0; i < shiftedLength; ++i) {
+				channel[i] = channel[i + outLatency];
+			}
+		}
+
+		// Output view which starts `offset` samples in, so `flush()` writes straight into the tail of `outputs`
+		struct OffsetOutput {
+			Outputs &outputs;
+			int offset;
+
+			struct Channel {
+				OffsetOutput &oo;
+				int channel;
+
+				decltype(outputs[0][0]) operator[](int i) {
+					return oo.outputs[channel][oo.offset + i];
+				}
+			};
+
+			Channel operator[](int c){
+				return {*this, c};
+			}
+		} oo{outputs, shiftedLength};
+		// Get the final chunk - extra output is already folded back as part of this
+		flush(oo, outLatency);
+		return true;
+	}
+
 private:
 	bool _splitComputation = false;
 	struct {