Split .processSpectrum() into more steps

2025-02-21 14:47:56 +00:00 · 2025-02-21 14:47:56 +00:00 · 46d866e9fe
commit 46d866e9fe
parent f72c4f0985
5 changed files with 338 additions and 190 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -7,7 +7,7 @@ include(FetchContent)
 FetchContent_Declare(
 	signalsmith-linear
 	GIT_REPOSITORY https://github.com/Signalsmith-Audio/linear.git
-	GIT_TAG c600e0420d260469566c41e1ccb64f89ee439dd3
+	GIT_TAG 0.1.0
 	GIT_SHALLOW ON
 )
 FetchContent_MakeAvailable(signalsmith-linear)
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@ -2,6 +2,16 @@
 #include <iostream>
 #define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << "\n";
 // Index of the per-step stopwatch that is currently running (only meaningful while `activeStep` is true)
 size_t activeStepIndex = 0;
 // Profiling callbacks, declared up here because the stretch header (included below) only
 // calls the SIGNALSMITH_STRETCH_PROFILE_PROCESS_* hooks that are #defined before it is included.
 void profileProcessStart(int, int);
 void profileProcessEndStep();
 void profileProcessStep(size_t, size_t);
 void profileProcessEnd();
 // Map each optional hook macro onto the matching callback above
 #define SIGNALSMITH_STRETCH_PROFILE_PROCESS_START profileProcessStart
 #define SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP profileProcessStep
 #define SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP profileProcessEndStep
 #define SIGNALSMITH_STRETCH_PROFILE_PROCESS_END profileProcessEnd
 #include "signalsmith-stretch/signalsmith-stretch.h"
 #include "./util/stopwatch.h"
@ -9,8 +19,43 @@
 #include "./util/simple-args.h"
 #include "./util/wav.h"
 #include "plot/plot.h"
 // One stopwatch per processing step, indexed by the step number passed to profileProcessStep()
 std::vector<signalsmith::Stopwatch> processStopwatches;
 // Start: lap begun in profileProcessStart() and closed by the first following step/end-step hook.
 // End: lap begun at every end-step hook and closed by profileProcessEnd().
 signalsmith::Stopwatch processStopwatchStart, processStopwatchEnd;
 // True between profileProcessStart() and the first hook that closes the "start" lap
 bool started = false;
 // True while a per-step stopwatch (processStopwatches[activeStepIndex]) is running
 bool activeStep = false;
 // Hook called at the top of `.process()`: marks that no step is being timed yet
 // and begins the "start" lap (closed later by profileProcessEndStep()).
 void profileProcessStart(int /*inputSamples*/, int /*outputSamples*/) {
 	started = true;
 	activeStep = false;
 	processStopwatchStart.startLap();
 }
 // Closes whichever lap is currently open (a per-step lap takes priority over the
 // "start" lap), then begins the "end" lap, which profileProcessEnd() closes.
 void profileProcessEndStep() {
 	if (!activeStep && started) {
 		// No step has run since profileProcessStart(): finish the "start" lap
 		started = false;
 		processStopwatchStart.lap();
 	} else if (activeStep) {
 		// Finish timing the step that was in progress
 		activeStep = false;
 		processStopwatches[activeStepIndex].lap();
 	}
 	processStopwatchEnd.startLap();
 }
 // Hook called before each processing step.
 //   step:  index of the step about to run
 //   count: total number of steps reported by the caller for the current block
 // Closes the previously open lap and starts the lap for `step`.
 void profileProcessStep(size_t step, size_t count) {
 	profileProcessEndStep();
 	activeStep = true;
 	activeStepIndex = step;
 	// One stopwatch per step.  Grow far enough that `step` is always a valid index,
 	// even if a caller reports step >= count (which previously indexed out of bounds).
 	size_t needed = (step < count) ? count : step + 1;
 	if (processStopwatches.size() < needed) {
 		processStopwatches.resize(needed);
 	}
 	processStopwatches[step].startLap();
 }
 // Hook called at the bottom of `.process()`: closes the "end" lap
 // that the last profileProcessEndStep() opened.
 void profileProcessEnd() {
 	(void)processStopwatchEnd.lap(); // the returned lap time isn't needed here
 }
 int main(int argc, char* argv[]) {
 	signalsmith::stretch::SignalsmithStretch<float/*, std::ranlux48_base*/> stretch; // optional cheaper RNG for performance comparison
 	processStopwatches.reserve(1000);
 	SimpleArgs args(argc, argv);
@ -56,7 +101,7 @@ int main(int argc, char* argv[]) {
 	stopwatch.start();
 	stretch.presetDefault(inWav.channels, inWav.sampleRate);
 	stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
-	double initSeconds = stopwatch.seconds(stopwatch.lap());
+	double initSeconds = stopwatch.lap();
 	initMemory = initMemory.diff();
 	std::cout << "Setup:\n\t" << initSeconds << "s\n";
@ -85,7 +130,7 @@ int main(int argc, char* argv[]) {
 	stretch.flush(outWav, tailSamples);
 	outWav.offset -= outputLength;
-	double processSeconds = stopwatch.seconds(stopwatch.lap());
+	double processSeconds = stopwatch.lap();
 	double processRate = (inWav.length()/inWav.sampleRate)/processSeconds;
 	double processPercent = 100/processRate;
 	processMemory = processMemory.diff();
@ -109,6 +154,31 @@ int main(int argc, char* argv[]) {
 		// the `.flush()` call already handled foldback stuff at the end (since we asked for a shorter `tailSamples`)
 	}
 	signalsmith::plot::Plot2D plot(400, 150);
 	plot.x.major(0, "").label("step");
 	plot.y.major(0).label("time spent");
 	auto &line = plot.line().fillToY(0);
 	auto &extraLine = plot.line().fillToY(0);
 	for (size_t i = 0; i < processStopwatches.size(); ++i) {
 		double time = processStopwatches[i].total();
 		if (i%5 == 0) {
 			plot.x.tick(i + 0.5, std::to_string(i));
 		} else {
 			plot.x.tick(i + 0.5, "");
 		}
 		line.add(i, time);
 		line.add(i + 1, time);
 	}
 	extraLine.add(-1, 0);
 	extraLine.add(-1, processStopwatchStart.total());
 	extraLine.add(0, processStopwatchStart.total());
 	extraLine.add(0, 0);
 	extraLine.add(processStopwatches.size(), 0);
 	extraLine.add(processStopwatches.size(), processStopwatchEnd.total());
 	extraLine.add(processStopwatches.size() + 1, processStopwatchEnd.total());
 	extraLine.add(processStopwatches.size() + 1, 0);
 	plot.write("profile.svg");
 	if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV");
 	if (compareReference && prevWav.result) {
--- a/cmd/util/stop-denormals.h
+++ b/cmd/util/stop-denormals.h
@ -0,0 +1,34 @@
 #pragma once
 #if defined(__SSE__) || defined(_M_X64)
 	// Scoped guard (SSE): sets the Flush-to-Zero and Denormals-Are-Zero bits in MXCSR
 	// for as long as the object lives, then writes back the value read at construction.
 	class StopDenormals {
 		unsigned int savedCsr; // MXCSR as it was before this guard changed it
 	public:
 		StopDenormals() : savedCsr(_mm_getcsr()) {
 			const unsigned int flushToZero = 0x8000;
 			const unsigned int denormalsAreZero = 0x0040;
 			_mm_setcsr(savedCsr|flushToZero|denormalsAreZero);
 		}
 		~StopDenormals() {
 			_mm_setcsr(savedCsr);
 		}
 	};
 #elif (defined (__ARM_NEON) || defined (__ARM_NEON__))
 	// Scoped guard (ARM): sets the Flush-to-Zero bit in FPCR for as long as the object
 	// lives, then writes back the FPCR value that was read at construction.
 	class StopDenormals {
 		uintptr_t status; // FPCR as it was *before* this guard changed it
 	public:
 		StopDenormals() {
 			uintptr_t asmStatus;
 			asm volatile("mrs %0, fpcr" : "=r"(asmStatus));
 			// Save the unmodified value first - storing the OR-ed value (as before) meant
 			// the destructor "restored" flush-to-zero instead of the caller's original mode.
 			status = asmStatus;
 			asmStatus |= 0x01000000U; // Flush to Zero
 			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
 		}
 		~StopDenormals() {
 			uintptr_t asmStatus = status;
 			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
 		}
 	};
 #else
 #	if __cplusplus >= 202302L
 # 		warning "The `StopDenormals` class doesn't do anything for this architecture"
 #	endif
 	class StopDenormals {}; // FIXME: add for other architectures
 #endif
--- a/cmd/util/stopwatch.h
+++ b/cmd/util/stopwatch.h
@ -6,37 +6,40 @@
 #include <atomic>
 #include <algorithm>
-// We want CPU time, not wall-clock time, so we can't use `std::chrono::high_resolution_clock`
+#ifdef WINDOWS // completely untested!
 #ifdef WINDOWS
 #	include <windows.h>
 namespace signalsmith {
 class Stopwatch {
 	using Time = __int64;
 	using Duration = Time;
 	inline Time now() {
 		LARGE_INTEGER result;
 		QueryPerformanceCounter(&result);
 		return result.QuadPart;
 	}
-	static double timeToSeconds(double t) {
+	static double toSeconds(Duration t) {
 		// Converts a QueryPerformanceCounter tick difference into seconds.
 		// LARGE_INTEGER is a union, so the frequency has to be read through .QuadPart
 		// (the same member now() reads) - casting the union itself doesn't compile.
 		LARGE_INTEGER freq;
 		QueryPerformanceFrequency(&freq);
 		return t/double(freq.QuadPart);
 	}
 #else
-#	include <ctime>
+#	include <chrono>
 namespace signalsmith {
 class Stopwatch {
-	using Time = std::clock_t;
+	using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
 	using Time = Clock::time_point;
 	using Duration = std::chrono::duration<double>;
 	inline Time now() {
-		return std::clock();
+		return Clock::now();
 	}
-	static double timeToSeconds(double t) {
+	static double toSeconds(Duration duration) {
-		return t/double(CLOCKS_PER_SEC);
+		return duration.count();
 	}
 #endif
 	std::atomic<Time> lapStart; // the atomic store/load should act as barriers for reordering operations
-	Time lapBest, lapTotal, lapTotal2;
+	double lapBest, lapTotal, lapTotal2;
 	double lapOverhead = 0;
 	int lapCount = 0;
@ -53,23 +56,22 @@ public:
 		}
 		start();
 	}
-
+	// Explicit because std::atomic<> can't be copied/moved
-	static double seconds(double time) {
+	Stopwatch(const Stopwatch &other) : lapBest(other.lapBest), lapTotal(other.lapTotal), lapTotal2(other.lapTotal2), lapOverhead(other.lapOverhead), lapCount(other.lapCount) {
-		return timeToSeconds(time);
+		lapStart.store(other.lapStart.load());
 	}
 	void start() {
 		lapCount = 0;
 		lapTotal = lapTotal2 = 0;
-		lapBest = std::numeric_limits<Time>::max();
+		lapBest = std::numeric_limits<double>::max();
 		startLap();
 	}
 	void startLap() {
 		lapStart.store(now());
 	}
 	double lap() {
-		auto start = lapStart.load();
+		double diff = toSeconds(now() - lapStart.load());
 		auto diff = now() - start;
 		if (diff < lapBest) lapBest = diff;
 		lapCount++;
@ -100,5 +102,6 @@ public:
 	}
 };
-} // namespace
+} //namespace
 #endif // include guard
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@ -140,8 +140,12 @@ struct SignalsmithStretch {
 	template<class Inputs, class Outputs>
 	void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
 #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_START
 		SIGNALSMITH_STRETCH_PROFILE_PROCESS_START(inputSamples, outputSamples);
 #endif
 		int prevCopiedInput = 0;
 		auto copyInput = [&](int toIndex){
 			int length = std::min<int>(stft.blockSamples() + stft.defaultInterval(), toIndex - prevCopiedInput);
 			tmpBuffer.resize(length);
 			int offset = toIndex - length;
@ -164,6 +168,7 @@ struct SignalsmithStretch {
 				totalEnergy += s*s;
 			}
 		}
 		if (totalEnergy < noiseFloor) {
 			if (silenceCounter >= 2*stft.blockSamples()) {
 				if (silenceFirst) { // first block of silence processing
@ -209,6 +214,9 @@ struct SignalsmithStretch {
 			size_t processToStep = std::min<size_t>(blockProcess.steps, blockProcess.steps*processRatio);
 			while (blockProcess.step < processToStep) {
 				size_t step = blockProcess.step++;
 #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP
 				SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP(step, blockProcess.steps);
 #endif
 				if (blockProcess.newSpectrum) {
 					if (blockProcess.reanalysePrev) {
@ -237,7 +245,7 @@ struct SignalsmithStretch {
 					// Analyse latest (stashed) input
 					if (step < stft.analyseSteps()) {
 						stashedInput.swap(stft.input);
-						stft.analyse();
+						stft.analyseStep(step);
 						stashedInput.swap(stft.input);
 						continue;
 					}
@ -257,7 +265,7 @@ struct SignalsmithStretch {
 				}
 				if (step < processSpectrumSteps) {
-					processSpectrum(step, blockProcess.newSpectrum, blockProcess.timeFactor);
+					processSpectrum(step);
 					continue;
 				}
 				step -= processSpectrumSteps;
@ -279,10 +287,7 @@ struct SignalsmithStretch {
 					stft.synthesiseStep(step);
 					continue;
 				}
-				LOG_EXPR("uh oh");
+				// This should never happen - something has gone terribly wrong
 				LOG_EXPR(processToStep);
 				LOG_EXPR(blockProcess.steps);
 				LOG_EXPR(blockProcess.step);
 				abort();
 			}
 			if (processRatio >= 1) { // we *should* have just written a block, and are now ready to start a new one
@ -301,9 +306,10 @@ struct SignalsmithStretch {
 				stft.moveOutput(stft.defaultInterval()); // the actual input jumps forward in time by one interval, ready for the synthesis
 				blockProcess.newSpectrum = didSeek || (inputInterval > 0);
 				blockProcess.mappedFrequencies = customFreqMap || freqMultiplier != 1;
 				if (blockProcess.newSpectrum) {
-					// make sure the previous input is the correct distance in the past
+					// make sure the previous input is the correct distance in the past (give or take 1 sample)
-					blockProcess.reanalysePrev = didSeek || inputInterval != int(stft.defaultInterval());
+					blockProcess.reanalysePrev = didSeek || std::abs(inputInterval - int(stft.defaultInterval())) > 1;
 					if (blockProcess.reanalysePrev) blockProcess.steps += stft.analyseSteps() + 1;
 					// analyse a new input
@ -312,11 +318,15 @@ struct SignalsmithStretch {
 				blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
 				didSeek = false;
-				
+
 				updateProcessSpectrumSteps();
 				blockProcess.steps += processSpectrumSteps;
 				blockProcess.steps += stft.synthesiseSteps() + 1;
 			}
 #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP
 			SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP();
 #endif
 			++blockProcess.samplesSinceLast;
 			stashedOutput.swap(stft.output);
@ -332,6 +342,9 @@ struct SignalsmithStretch {
 		copyInput(inputSamples);
 		prevInputOffset -= inputSamples;
 #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_END
 		SIGNALSMITH_STRETCH_PROFILE_PROCESS_END();
 #endif
 	}
 	// Read the remaining output, providing no further input.  `outputSamples` should ideally be at least `.outputLatency()`
@ -372,6 +385,7 @@ private:
 		bool newSpectrum = false;
 		bool reanalysePrev = false;
 		bool mappedFrequencies = false;
 		Sample timeFactor;
 	} blockProcess;
@ -477,195 +491,224 @@ private:
 	RandomEngine randomEngine;
-	static constexpr size_t processSpectrumSteps = 6;
+	size_t processSpectrumSteps = 0;
-	void processSpectrum(size_t step, bool newSpectrum, Sample timeFactor) {
+	static constexpr size_t splitMainPrediction = 8; // it's just heavy, since we're blending up to 4 different phase predictions
 	// Recomputes `processSpectrumSteps`: how many incremental steps processSpectrum()
 	// is split into for the current block, based on the `blockProcess` flags.
 	void updateProcessSpectrumSteps() {
 		const bool freshSpectrum = blockProcess.newSpectrum;
 		size_t total = 0;
 		if (freshSpectrum) total += channels; // one step per channel when there's a new spectrum
 		if (blockProcess.mappedFrequencies) {
 			total += smoothEnergySteps;
 			total += 1; // findPeaks
 		}
 		total += 1; // updating the output map
 		total += channels; // preliminary phase-vocoder prediction
 		total += splitMainPrediction;
 		if (freshSpectrum) total += 1; // .input -> .prevInput
 		processSpectrumSteps = total;
 	}
 	void processSpectrum(size_t step) {
 		Sample timeFactor = blockProcess.timeFactor;
 		Sample smoothingBins = Sample(stft.fftSamples())/stft.defaultInterval();
 		int longVerticalStep = std::round(smoothingBins);
 		timeFactor = std::max<Sample>(timeFactor, 1/maxCleanStretch);
 		bool randomTimeFactor = (timeFactor > maxCleanStretch);
 		std::uniform_real_distribution<Sample> timeFactorDist(maxCleanStretch*2*randomTimeFactor - timeFactor, timeFactor);
-		switch(step) {
+		if (blockProcess.newSpectrum) {
-			case 1: {
+			if (step < size_t(channels)) {
-				if (newSpectrum) {
+				int channel = step;
-					for (int c = 0; c < channels; ++c) {
+				auto bins = bandsForChannel(channel);
 						auto bins = bandsForChannel(c);
-						Complex rot = std::polar(Sample(1), bandToFreq(0)*stft.defaultInterval()*Sample(2*M_PI));
+				Complex rot = std::polar(Sample(1), bandToFreq(0)*stft.defaultInterval()*Sample(2*M_PI));
-						Sample freqStep = bandToFreq(1) - bandToFreq(0);
+				Sample freqStep = bandToFreq(1) - bandToFreq(0);
-						Complex rotStep = std::polar(Sample(1), freqStep*stft.defaultInterval()*Sample(2*M_PI));
+				Complex rotStep = std::polar(Sample(1), freqStep*stft.defaultInterval()*Sample(2*M_PI));
-						
+				 
-						for (int b = 0; b < bands; ++b) {
+				for (int b = 0; b < bands; ++b) {
-							auto &bin = bins[b];
+					auto &bin = bins[b];
-							bin.output = _impl::mul(bin.output, rot);
+					bin.output = _impl::mul(bin.output, rot);
-							bin.prevInput = _impl::mul(bin.prevInput, rot);
+					bin.prevInput = _impl::mul(bin.prevInput, rot);
-							rot = _impl::mul(rot, rotStep);
+					rot = _impl::mul(rot, rotStep);
 						}
 					}
 				}
 				return;
 			}
-			case 2: {
+			step -= channels;
-				if (customFreqMap || freqMultiplier != 1) {
+		}
-					findPeaks(smoothingBins);
+		if (blockProcess.mappedFrequencies) {
-				}
+			if (step < smoothEnergySteps) {
 				smoothEnergy(step, smoothingBins);
 				return;
 			}
-			case 3: {
+			step -= smoothEnergySteps;
-				if (customFreqMap || freqMultiplier != 1) {
+			if (step-- == 0) {
-					updateOutputMap();
+				findPeaks();
 				} else { // we're not pitch-shifting, so no need to find peaks etc.
 					for (int c = 0; c < channels; ++c) {
 						Band *bins = bandsForChannel(c);
 						for (int b = 0; b < bands; ++b) {
 							bins[b].inputEnergy = std::norm(bins[b].input);
 						}
 					}
 					for (int b = 0; b < bands; ++b) {
 						outputMap[b] = {Sample(b), 1};
 					}
 				}
 				return;
 			}
-			case 4: {
+		}
-				// Preliminary output prediction from phase-vocoder
+		if (step-- == 0) {
 			if (blockProcess.mappedFrequencies) {
 				updateOutputMap();
 			} else { // we're not pitch-shifting, so no need to find peaks etc.
 				for (int c = 0; c < channels; ++c) {
 					Band *bins = bandsForChannel(c);
 					auto *predictions = predictionsForChannel(c);
 					for (int b = 0; b < bands; ++b) {
-						auto mapPoint = outputMap[b];
+						bins[b].inputEnergy = std::norm(bins[b].input);
 						int lowIndex = std::floor(mapPoint.inputBin);
 						Sample fracIndex = mapPoint.inputBin - lowIndex;
 						Prediction &prediction = predictions[b];
 						Sample prevEnergy = prediction.energy;
 						prediction.energy = getFractional<&Band::inputEnergy>(c, lowIndex, fracIndex);
 						prediction.energy *= std::max<Sample>(0, mapPoint.freqGrad); // scale the energy according to local stretch factor
 						prediction.input = getFractional<&Band::input>(c, lowIndex, fracIndex);
 						auto &outputBin = bins[b];
 						Complex prevInput = getFractional<&Band::prevInput>(c, lowIndex, fracIndex);
 						Complex freqTwist = _impl::mul<true>(prediction.input, prevInput);
 						Complex phase = _impl::mul(outputBin.output, freqTwist);
 						outputBin.output = phase/(std::max(prevEnergy, prediction.energy) + noiseFloor);
 					}
 				}
 				return;
 			}
 			case 5: {
 				// Re-predict using phase differences between frequencies
 				for (int b = 0; b < bands; ++b) {
-					// Find maximum-energy channel and calculate that
+					outputMap[b] = {Sample(b), 1};
 					int maxChannel = 0;
 					Sample maxEnergy = predictionsForChannel(0)[b].energy;
 					for (int c = 1; c < channels; ++c) {
 						Sample e = predictionsForChannel(c)[b].energy;
 						if (e > maxEnergy) {
 							maxChannel = c;
 							maxEnergy = e;
 						}
 					}
 					auto *predictions = predictionsForChannel(maxChannel);
 					auto &prediction = predictions[b];
 					auto *bins = bandsForChannel(maxChannel);
 					auto &outputBin = bins[b];
 					Complex phase = 0;
 					auto mapPoint = outputMap[b];
 					// Upwards vertical steps
 					if (b > 0) {
 						Sample binTimeFactor = randomTimeFactor ? timeFactorDist(randomEngine) : timeFactor;
 						Complex downInput = getFractional<&Band::input>(maxChannel, mapPoint.inputBin - binTimeFactor);
 						Complex shortVerticalTwist = _impl::mul<true>(prediction.input, downInput);
 						auto &downBin = bins[b - 1];
 						phase += _impl::mul(downBin.output, shortVerticalTwist);
 						if (b >= longVerticalStep) {
 							Complex longDownInput = getFractional<&Band::input>(maxChannel, mapPoint.inputBin - longVerticalStep*binTimeFactor);
 							Complex longVerticalTwist = _impl::mul<true>(prediction.input, longDownInput);
 							auto &longDownBin = bins[b - longVerticalStep];
 							phase += _impl::mul(longDownBin.output, longVerticalTwist);
 						}
 					}
 					// Downwards vertical steps
 					if (b < bands - 1) {
 						auto &upPrediction = predictions[b + 1];
 						auto &upMapPoint = outputMap[b + 1];
 						Sample binTimeFactor = randomTimeFactor ? timeFactorDist(randomEngine) : timeFactor;
 						Complex downInput = getFractional<&Band::input>(maxChannel, upMapPoint.inputBin - binTimeFactor);
 						Complex shortVerticalTwist = _impl::mul<true>(upPrediction.input, downInput);
 						auto &upBin = bins[b + 1];
 						phase += _impl::mul<true>(upBin.output, shortVerticalTwist);
 						if (b < bands - longVerticalStep) {
 							auto &longUpPrediction = predictions[b + longVerticalStep];
 							auto &longUpMapPoint = outputMap[b + longVerticalStep];
 							Complex longDownInput = getFractional<&Band::input>(maxChannel, longUpMapPoint.inputBin - longVerticalStep*binTimeFactor);
 							Complex longVerticalTwist = _impl::mul<true>(longUpPrediction.input, longDownInput);
 							auto &longUpBin = bins[b + longVerticalStep];
 							phase += _impl::mul<true>(longUpBin.output, longVerticalTwist);
 						}
 					}
 					outputBin.output = prediction.makeOutput(phase);
 					// All other bins are locked in phase
 					for (int c = 0; c < channels; ++c) {
 						if (c != maxChannel) {
 							auto &channelBin = bandsForChannel(c)[b];
 							auto &channelPrediction = predictionsForChannel(c)[b];
 							Complex channelTwist = _impl::mul<true>(channelPrediction.input, prediction.input);
 							Complex channelPhase = _impl::mul(outputBin.output, channelTwist);
 							channelBin.output = channelPrediction.makeOutput(channelPhase);
 						}
 					}
 				}
 				if (newSpectrum) {
 					for (auto &bin : channelBands) {
 						bin.prevInput = bin.input;
 					}
 				}
 				return;
 			}
-		} // switch
+			return;
 		}
 		if (step < size_t(channels)) {
 			size_t c = step;
 			Band *bins = bandsForChannel(c);
 			auto *predictions = predictionsForChannel(c);
 			for (int b = 0; b < bands; ++b) {
 				auto mapPoint = outputMap[b];
 				int lowIndex = std::floor(mapPoint.inputBin);
 				Sample fracIndex = mapPoint.inputBin - lowIndex;
 				Prediction &prediction = predictions[b];
 				Sample prevEnergy = prediction.energy;
 				prediction.energy = getFractional<&Band::inputEnergy>(c, lowIndex, fracIndex);
 				prediction.energy *= std::max<Sample>(0, mapPoint.freqGrad); // scale the energy according to local stretch factor
 				prediction.input = getFractional<&Band::input>(c, lowIndex, fracIndex);
 				auto &outputBin = bins[b];
 				Complex prevInput = getFractional<&Band::prevInput>(c, lowIndex, fracIndex);
 				Complex freqTwist = _impl::mul<true>(prediction.input, prevInput);
 				Complex phase = _impl::mul(outputBin.output, freqTwist);
 				outputBin.output = phase/(std::max(prevEnergy, prediction.energy) + noiseFloor);
 			}
 			return;
 		}
 		step -= channels;
 		if (step < splitMainPrediction) {
 			// Re-predict using phase differences between frequencies
 			int chunk = step;
 			int startB = bands*chunk/splitMainPrediction;
 			int endB = bands*(chunk + 1)/splitMainPrediction;
 			for (int b = startB; b < endB; ++b) {
 				// Find maximum-energy channel and calculate that
 				int maxChannel = 0;
 				Sample maxEnergy = predictionsForChannel(0)[b].energy;
 				for (int c = 1; c < channels; ++c) {
 					Sample e = predictionsForChannel(c)[b].energy;
 					if (e > maxEnergy) {
 						maxChannel = c;
 						maxEnergy = e;
 					}
 				}
 				auto *predictions = predictionsForChannel(maxChannel);
 				auto &prediction = predictions[b];
 				auto *bins = bandsForChannel(maxChannel);
 				auto &outputBin = bins[b];
 				Complex phase = 0;
 				auto mapPoint = outputMap[b];
 				// Upwards vertical steps
 				if (b > 0) {
 					Sample binTimeFactor = randomTimeFactor ? timeFactorDist(randomEngine) : timeFactor;
 					Complex downInput = getFractional<&Band::input>(maxChannel, mapPoint.inputBin - binTimeFactor);
 					Complex shortVerticalTwist = _impl::mul<true>(prediction.input, downInput);
 					auto &downBin = bins[b - 1];
 					phase += _impl::mul(downBin.output, shortVerticalTwist);
 					if (b >= longVerticalStep) {
 						Complex longDownInput = getFractional<&Band::input>(maxChannel, mapPoint.inputBin - longVerticalStep*binTimeFactor);
 						Complex longVerticalTwist = _impl::mul<true>(prediction.input, longDownInput);
 						auto &longDownBin = bins[b - longVerticalStep];
 						phase += _impl::mul(longDownBin.output, longVerticalTwist);
 					}
 				}
 				// Downwards vertical steps
 				if (b < bands - 1) {
 					auto &upPrediction = predictions[b + 1];
 					auto &upMapPoint = outputMap[b + 1];
 					Sample binTimeFactor = randomTimeFactor ? timeFactorDist(randomEngine) : timeFactor;
 					Complex downInput = getFractional<&Band::input>(maxChannel, upMapPoint.inputBin - binTimeFactor);
 					Complex shortVerticalTwist = _impl::mul<true>(upPrediction.input, downInput);
 					auto &upBin = bins[b + 1];
 					phase += _impl::mul<true>(upBin.output, shortVerticalTwist);
 					if (b < bands - longVerticalStep) {
 						auto &longUpPrediction = predictions[b + longVerticalStep];
 						auto &longUpMapPoint = outputMap[b + longVerticalStep];
 						Complex longDownInput = getFractional<&Band::input>(maxChannel, longUpMapPoint.inputBin - longVerticalStep*binTimeFactor);
 						Complex longVerticalTwist = _impl::mul<true>(longUpPrediction.input, longDownInput);
 						auto &longUpBin = bins[b + longVerticalStep];
 						phase += _impl::mul<true>(longUpBin.output, longVerticalTwist);
 					}
 				}
 				outputBin.output = prediction.makeOutput(phase);
 				// All other bins are locked in phase
 				for (int c = 0; c < channels; ++c) {
 					if (c != maxChannel) {
 						auto &channelBin = bandsForChannel(c)[b];
 						auto &channelPrediction = predictionsForChannel(c)[b];
 						Complex channelTwist = _impl::mul<true>(channelPrediction.input, prediction.input);
 						Complex channelPhase = _impl::mul(outputBin.output, channelTwist);
 						channelBin.output = channelPrediction.makeOutput(channelPhase);
 					}
 				}
 			}
 			return;
 		}
 		step -= splitMainPrediction;
 		if (blockProcess.newSpectrum) {
 			for (auto &bin : channelBands) {
 				bin.prevInput = bin.input;
 			}
 		}
 	}
 	// Produces smoothed energy across all channels
-	void smoothEnergy(Sample smoothingBins) {
+	static constexpr size_t smoothEnergySteps = 3;
 	Sample smoothEnergyState = 0;
 	void smoothEnergy(size_t step, Sample smoothingBins) {
 		Sample smoothingSlew = 1/(1 + smoothingBins*Sample(0.5));
-		for (auto &e : energy) e = 0;
+		if (step-- == 0) {
-		for (int c = 0; c < channels; ++c) {
+			for (auto &e : energy) e = 0;
-			Band *bins = bandsForChannel(c);
+			for (int c = 0; c < channels; ++c) {
-			for (int b = 0; b < bands; ++b) {
+				Band *bins = bandsForChannel(c);
-				Sample e = std::norm(bins[b].input);
+				for (int b = 0; b < bands; ++b) {
-				bins[b].inputEnergy = e; // Used for interpolating prediction energy
+					Sample e = std::norm(bins[b].input);
-				energy[b] += e;
+					bins[b].inputEnergy = e; // Used for interpolating prediction energy
 					energy[b] += e;
 				}
 			}
 			for (int b = 0; b < bands; ++b) {
 				smoothedEnergy[b] = energy[b];
 			}
 			smoothEnergyState = 0;
 			return;
 		}
 		// The two other steps are repeated smoothing passes, down and up
 		Sample e = smoothEnergyState;
 		for (int b = bands - 1; b >= 0; --b) {
 			e += (smoothedEnergy[b] - e)*smoothingSlew;
 			smoothedEnergy[b] = e;
 		}
 		for (int b = 0; b < bands; ++b) {
-			smoothedEnergy[b] = energy[b];
+			e += (smoothedEnergy[b] - e)*smoothingSlew;
-		}
+			smoothedEnergy[b] = e;
 		Sample e = 0;
 		for (int repeat = 0; repeat < 2; ++repeat) {
 			for (int b = bands - 1; b >= 0; --b) {
 				e += (smoothedEnergy[b] - e)*smoothingSlew;
 				smoothedEnergy[b] = e;
 			}
 			for (int b = 0; b < bands; ++b) {
 				e += (smoothedEnergy[b] - e)*smoothingSlew;
 				smoothedEnergy[b] = e;
 			}
 		}
 		smoothEnergyState = e;
 	}
 	Sample mapFreq(Sample freq) const {
@ -678,9 +721,7 @@ private:
 	}
 	// Identifies spectral peaks using energy across all channels
-	void findPeaks(Sample smoothingBins) {
+	void findPeaks() {
 		smoothEnergy(smoothingBins);
 		peaks.resize(0);
 		int start = 0;