Split .processSpectrum() into more steps

2025-02-21 14:47:56 +00:00 · 2025-02-21 14:47:56 +00:00 · 46d866e9fe
commit 46d866e9fe
parent f72c4f0985
5 changed files with 338 additions and 190 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -7,7 +7,7 @@ include(FetchContent)
 FetchContent_Declare(
 	signalsmith-linear
 	GIT_REPOSITORY https://github.com/Signalsmith-Audio/linear.git
-	GIT_TAG c600e0420d260469566c41e1ccb64f89ee439dd3
+	GIT_TAG 0.1.0
 	GIT_SHALLOW ON
 )
 FetchContent_MakeAvailable(signalsmith-linear)
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@ -2,6 +2,16 @@
 #include <iostream>
 #define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << "\n";

+size_t activeStepIndex = 0;
+void profileProcessStart(int, int);
+void profileProcessEndStep();
+void profileProcessStep(size_t, size_t);
+void profileProcessEnd();
+#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_START profileProcessStart
+#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP profileProcessStep
+#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP profileProcessEndStep
+#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_END profileProcessEnd
+
 #include "signalsmith-stretch/signalsmith-stretch.h"

 #include "./util/stopwatch.h"
@ -9,9 +19,44 @@
 #include "./util/simple-args.h"
 #include "./util/wav.h"

+#include "plot/plot.h"
+std::vector<signalsmith::Stopwatch> processStopwatches;
+signalsmith::Stopwatch processStopwatchStart, processStopwatchEnd;
+bool started = false;
+bool activeStep = false;
+void profileProcessStart(int /*inputSamples*/, int /*outputSamples*/) {
+	activeStep = false;
+	started = true;
+	processStopwatchStart.startLap();
+}
+void profileProcessEndStep() {
+	if (activeStep) {
+		activeStep = false;
+		processStopwatches[activeStepIndex].lap();
+	} else if (started) {
+		started = false;
+		processStopwatchStart.lap();
+	}
+	processStopwatchEnd.startLap();
+}
+void profileProcessStep(size_t step, size_t count) {
+	profileProcessEndStep();
+	activeStep = true;
+	activeStepIndex = step;
+	if (processStopwatches.size() < count) {
+		processStopwatches.resize(count);
+	}
+	processStopwatches[step].startLap();
+}
+void profileProcessEnd() {
+	processStopwatchEnd.lap();
+}
+
 int main(int argc, char* argv[]) {
 	signalsmith::stretch::SignalsmithStretch<float/*, std::ranlux48_base*/> stretch; // optional cheaper RNG for performance comparison
 	
+	processStopwatches.reserve(1000);
+
 	SimpleArgs args(argc, argv);
 	
 	if (args.hasFlag("v", "prints the version")) {
@ -56,7 +101,7 @@ int main(int argc, char* argv[]) {
 	stopwatch.start();
 	stretch.presetDefault(inWav.channels, inWav.sampleRate);
 	stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
-	double initSeconds = stopwatch.seconds(stopwatch.lap());
+	double initSeconds = stopwatch.lap();

 	initMemory = initMemory.diff();
 	std::cout << "Setup:\n\t" << initSeconds << "s\n";
@ -85,7 +130,7 @@ int main(int argc, char* argv[]) {
 	stretch.flush(outWav, tailSamples);
 	outWav.offset -= outputLength;

-	double processSeconds = stopwatch.seconds(stopwatch.lap());
+	double processSeconds = stopwatch.lap();
 	double processRate = (inWav.length()/inWav.sampleRate)/processSeconds;
 	double processPercent = 100/processRate;
 	processMemory = processMemory.diff();
@ -109,6 +154,31 @@ int main(int argc, char* argv[]) {
 		// the `.flush()` call already handled foldback stuff at the end (since we asked for a shorter `tailSamples`)
 	}

+	signalsmith::plot::Plot2D plot(400, 150);
+	plot.x.major(0, "").label("step");
+	plot.y.major(0).label("time spent");
+	auto &line = plot.line().fillToY(0);
+	auto &extraLine = plot.line().fillToY(0);
+	for (size_t i = 0; i < processStopwatches.size(); ++i) {
+		double time = processStopwatches[i].total();
+		if (i%5 == 0) {
+			plot.x.tick(i + 0.5, std::to_string(i));
+		} else {
+			plot.x.tick(i + 0.5, "");
+		}
+		line.add(i, time);
+		line.add(i + 1, time);
+	}
+	extraLine.add(-1, 0);
+	extraLine.add(-1, processStopwatchStart.total());
+	extraLine.add(0, processStopwatchStart.total());
+	extraLine.add(0, 0);
+	extraLine.add(processStopwatches.size(), 0);
+	extraLine.add(processStopwatches.size(), processStopwatchEnd.total());
+	extraLine.add(processStopwatches.size() + 1, processStopwatchEnd.total());
+	extraLine.add(processStopwatches.size() + 1, 0);
+	plot.write("profile.svg");
+
 	if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV");
 	
 	if (compareReference && prevWav.result) {
--- a/cmd/util/stop-denormals.h
+++ b/cmd/util/stop-denormals.h
@ -0,0 +1,34 @@
+#pragma once
+
+#if defined(__SSE__) || defined(_M_X64)
+	class StopDenormals {
+		unsigned int controlStatusRegister;
+	public:
+		StopDenormals() : controlStatusRegister(_mm_getcsr()) {
+			_mm_setcsr(controlStatusRegister|0x8040); // Flush-to-Zero and Denormals-Are-Zero
+		}
+		~StopDenormals() {
+			_mm_setcsr(controlStatusRegister);
+		}
+	};
+#elif (defined (__ARM_NEON) || defined (__ARM_NEON__))
+	class StopDenormals {
+		uintptr_t status;
+	public:
+		StopDenormals() {
+			uintptr_t asmStatus;
+			asm volatile("mrs %0, fpcr" : "=r"(asmStatus));
+			status = asmStatus = asmStatus|0x01000000U; // Flush to Zero
+			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
+		}
+		~StopDenormals() {
+			uintptr_t asmStatus = status;
+			asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
+		}
+	};
+#else
+#	if __cplusplus >= 202302L
+# 		warning "The `StopDenormals` class doesn't do anything for this architecture"
+#	endif
+	class StopDenormals {}; // FIXME: add for other architectures
+#endif
--- a/cmd/util/stopwatch.h
+++ b/cmd/util/stopwatch.h
@ -6,37 +6,40 @@
 #include <atomic>
 #include <algorithm>

-// We want CPU time, not wall-clock time, so we can't use `std::chrono::high_resolution_clock`
-#ifdef WINDOWS
+#ifdef WINDOWS // completely untested!
 #	include <windows.h>
 namespace signalsmith {
 class Stopwatch {
 	using Time = __int64;
+	using Duration = Time;
 	inline Time now() {
 		LARGE_INTEGER result;
 		QueryPerformanceCounter(&result);
 		return result.QuadPart;
 	}
-	static double timeToSeconds(double t) {
+	static double toSeconds(Duration t) {
 		LARGE_INTEGER freq;
 		QueryPerformanceFrequency(&freq);
 		return t/double(freq);
 	}
 #else
-#	include <ctime>
+#	include <chrono>
 namespace signalsmith {
 class Stopwatch {
-	using Time = std::clock_t;
+	using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
+	using Time = Clock::time_point;
+	using Duration = std::chrono::duration<double>;
+	
 	inline Time now() {
-		return std::clock();
+		return Clock::now();
 	}
-	static double timeToSeconds(double t) {
-		return t/double(CLOCKS_PER_SEC);
+	static double toSeconds(Duration duration) {
+		return duration.count();
 	}
 #endif

 	std::atomic<Time> lapStart; // the atomic store/load should act as barriers for reordering operations
-	Time lapBest, lapTotal, lapTotal2;
+	double lapBest, lapTotal, lapTotal2;
 	double lapOverhead = 0;
 	int lapCount = 0;
 	
@ -53,23 +56,22 @@ public:
 		}
 		start();
 	}
-
-	static double seconds(double time) {
-		return timeToSeconds(time);
+	// Explicit because std::atomic<> can't be copied/moved
+	Stopwatch(const Stopwatch &other) : lapBest(other.lapBest), lapTotal(other.lapTotal), lapTotal2(other.lapTotal2), lapOverhead(other.lapOverhead), lapCount(other.lapCount) {
+		lapStart.store(other.lapStart.load());
 	}

 	void start() {
 		lapCount = 0;
 		lapTotal = lapTotal2 = 0;
-		lapBest = std::numeric_limits<Time>::max();
+		lapBest = std::numeric_limits<double>::max();
 		startLap();
 	}
 	void startLap() {
 		lapStart.store(now());
 	}
 	double lap() {
-		auto start = lapStart.load();
-		auto diff = now() - start;
+		double diff = toSeconds(now() - lapStart.load());

 		if (diff < lapBest) lapBest = diff;
 		lapCount++;
@ -100,5 +102,6 @@ public:
 	}
 };

-} // namespace
+} //namespace
+
 #endif // include guard
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@ -140,8 +140,12 @@ struct SignalsmithStretch {
 	
 	template<class Inputs, class Outputs>
 	void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
+#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_START
+		SIGNALSMITH_STRETCH_PROFILE_PROCESS_START(inputSamples, outputSamples);
+#endif
 		int prevCopiedInput = 0;
 		auto copyInput = [&](int toIndex){
+
 			int length = std::min<int>(stft.blockSamples() + stft.defaultInterval(), toIndex - prevCopiedInput);
 			tmpBuffer.resize(length);
 			int offset = toIndex - length;
@ -164,6 +168,7 @@ struct SignalsmithStretch {
 				totalEnergy += s*s;
 			}
 		}
+
 		if (totalEnergy < noiseFloor) {
 			if (silenceCounter >= 2*stft.blockSamples()) {
 				if (silenceFirst) { // first block of silence processing
@ -209,6 +214,9 @@ struct SignalsmithStretch {
 			size_t processToStep = std::min<size_t>(blockProcess.steps, blockProcess.steps*processRatio);
 			while (blockProcess.step < processToStep) {
 				size_t step = blockProcess.step++;
+#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP
+				SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP(step, blockProcess.steps);
+#endif
 				
 				if (blockProcess.newSpectrum) {
 					if (blockProcess.reanalysePrev) {
@ -237,7 +245,7 @@ struct SignalsmithStretch {
 					// Analyse latest (stashed) input
 					if (step < stft.analyseSteps()) {
 						stashedInput.swap(stft.input);
-						stft.analyse();
+						stft.analyseStep(step);
 						stashedInput.swap(stft.input);
 						continue;
 					}
@ -257,7 +265,7 @@ struct SignalsmithStretch {
 				}

 				if (step < processSpectrumSteps) {
-					processSpectrum(step, blockProcess.newSpectrum, blockProcess.timeFactor);
+					processSpectrum(step);
 					continue;
 				}
 				step -= processSpectrumSteps;
@ -279,10 +287,7 @@ struct SignalsmithStretch {
 					stft.synthesiseStep(step);
 					continue;
 				}
-				LOG_EXPR("uh oh");
-				LOG_EXPR(processToStep);
-				LOG_EXPR(blockProcess.steps);
-				LOG_EXPR(blockProcess.step);
+				// This should never happen - something has gone terribly wrong
 				abort();
 			}
 			if (processRatio >= 1) { // we *should* have just written a block, and are now ready to start a new one
@ -301,9 +306,10 @@ struct SignalsmithStretch {
 				stft.moveOutput(stft.defaultInterval()); // the actual input jumps forward in time by one interval, ready for the synthesis

 				blockProcess.newSpectrum = didSeek || (inputInterval > 0);
+				blockProcess.mappedFrequencies = customFreqMap || freqMultiplier != 1;
 				if (blockProcess.newSpectrum) {
-					// make sure the previous input is the correct distance in the past
-					blockProcess.reanalysePrev = didSeek || inputInterval != int(stft.defaultInterval());
+					// make sure the previous input is the correct distance in the past (give or take 1 sample)
+					blockProcess.reanalysePrev = didSeek || std::abs(inputInterval - int(stft.defaultInterval())) > 1;
 					if (blockProcess.reanalysePrev) blockProcess.steps += stft.analyseSteps() + 1;

 					// analyse a new input
@ -313,10 +319,14 @@ struct SignalsmithStretch {
 				blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
 				didSeek = false;

+				updateProcessSpectrumSteps();
 				blockProcess.steps += processSpectrumSteps;

 				blockProcess.steps += stft.synthesiseSteps() + 1;
 			}
+#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP
+			SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP();
+#endif

 			++blockProcess.samplesSinceLast;
 			stashedOutput.swap(stft.output);
@ -332,6 +342,9 @@ struct SignalsmithStretch {
 		
 		copyInput(inputSamples);
 		prevInputOffset -= inputSamples;
+#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_END
+		SIGNALSMITH_STRETCH_PROFILE_PROCESS_END();
+#endif
 	}

 	// Read the remaining output, providing no further input.  `outputSamples` should ideally be at least `.outputLatency()`
@ -372,6 +385,7 @@ private:
 		
 		bool newSpectrum = false;
 		bool reanalysePrev = false;
+		bool mappedFrequencies = false;
 		Sample timeFactor;
 	} blockProcess;

@ -477,19 +491,33 @@ private:

 	RandomEngine randomEngine;

-	static constexpr size_t processSpectrumSteps = 6;
-	void processSpectrum(size_t step, bool newSpectrum, Sample timeFactor) {
+	size_t processSpectrumSteps = 0;
+	static constexpr size_t splitMainPrediction = 8; // it's just heavy, since we're blending up to 4 different phase predictions
+	void updateProcessSpectrumSteps() {
+		processSpectrumSteps = 0;
+		if (blockProcess.newSpectrum) processSpectrumSteps += channels;
+		if (blockProcess.mappedFrequencies) {
+			processSpectrumSteps += smoothEnergySteps;
+			processSpectrumSteps += 1; // findPeaks
+		}
+		processSpectrumSteps += 1; // updating the output map
+		processSpectrumSteps += channels; // preliminary phase-vocoder prediction
+		processSpectrumSteps += splitMainPrediction;
+		if (blockProcess.newSpectrum) processSpectrumSteps += 1; // .input -> .prevInput
+	}
+	void processSpectrum(size_t step) {
+		Sample timeFactor = blockProcess.timeFactor;
+		
 		Sample smoothingBins = Sample(stft.fftSamples())/stft.defaultInterval();
 		int longVerticalStep = std::round(smoothingBins);
 		timeFactor = std::max<Sample>(timeFactor, 1/maxCleanStretch);
 		bool randomTimeFactor = (timeFactor > maxCleanStretch);
 		std::uniform_real_distribution<Sample> timeFactorDist(maxCleanStretch*2*randomTimeFactor - timeFactor, timeFactor);

-		switch(step) {
-			case 1: {
-				if (newSpectrum) {
-					for (int c = 0; c < channels; ++c) {
-						auto bins = bandsForChannel(c);
+		if (blockProcess.newSpectrum) {
+			if (step < size_t(channels)) {
+				int channel = step;
+				auto bins = bandsForChannel(channel);

 				Complex rot = std::polar(Sample(1), bandToFreq(0)*stft.defaultInterval()*Sample(2*M_PI));
 				Sample freqStep = bandToFreq(1) - bandToFreq(0);
@ -501,18 +529,23 @@ private:
 					bin.prevInput = _impl::mul(bin.prevInput, rot);
 					rot = _impl::mul(rot, rotStep);
 				}
-					}
-				}
 				return;
 			}
-			case 2: {
-				if (customFreqMap || freqMultiplier != 1) {
-					findPeaks(smoothingBins);
+			step -= channels;
 		}
+		if (blockProcess.mappedFrequencies) {
+			if (step < smoothEnergySteps) {
+				smoothEnergy(step, smoothingBins);
 				return;
 			}
-			case 3: {
-				if (customFreqMap || freqMultiplier != 1) {
+			step -= smoothEnergySteps;
+			if (step-- == 0) {
+				findPeaks();
+				return;
+			}
+		}
+		if (step-- == 0) {
+			if (blockProcess.mappedFrequencies) {
 				updateOutputMap();
 			} else { // we're not pitch-shifting, so no need to find peaks etc.
 				for (int c = 0; c < channels; ++c) {
@ -527,9 +560,8 @@ private:
 			}
 			return;
 		}
-			case 4: {
-				// Preliminary output prediction from phase-vocoder
-				for (int c = 0; c < channels; ++c) {
+		if (step < size_t(channels)) {
+			size_t c = step;
 			Band *bins = bandsForChannel(c);
 			auto *predictions = predictionsForChannel(c);
 			for (int b = 0; b < bands; ++b) {
@ -549,12 +581,16 @@ private:
 				Complex phase = _impl::mul(outputBin.output, freqTwist);
 				outputBin.output = phase/(std::max(prevEnergy, prediction.energy) + noiseFloor);
 			}
-				}
 			return;
 		}
-			case 5: {
+		step -= channels;
+
+		if (step < splitMainPrediction) {
 			// Re-predict using phase differences between frequencies
-				for (int b = 0; b < bands; ++b) {
+			int chunk = step;
+			int startB = bands*chunk/splitMainPrediction;
+			int endB = bands*(chunk + 1)/splitMainPrediction;
+			for (int b = startB; b < endB; ++b) {
 				// Find maximum-energy channel and calculate that
 				int maxChannel = 0;
 				Sample maxEnergy = predictionsForChannel(0)[b].energy;
@ -629,20 +665,23 @@ private:
 					}
 				}
 			}
+			return;
+		}
+		step -= splitMainPrediction;

-				if (newSpectrum) {
+		if (blockProcess.newSpectrum) {
 			for (auto &bin : channelBands) {
 				bin.prevInput = bin.input;
 			}
 		}
-				return;
-			}
-		} // switch
 	}
 	
 	// Produces smoothed energy across all channels
-	void smoothEnergy(Sample smoothingBins) {
+	static constexpr size_t smoothEnergySteps = 3;
+	Sample smoothEnergyState = 0;
+	void smoothEnergy(size_t step, Sample smoothingBins) {
 		Sample smoothingSlew = 1/(1 + smoothingBins*Sample(0.5));
+		if (step-- == 0) {
 			for (auto &e : energy) e = 0;
 			for (int c = 0; c < channels; ++c) {
 				Band *bins = bandsForChannel(c);
@ -655,8 +694,12 @@ private:
 			for (int b = 0; b < bands; ++b) {
 				smoothedEnergy[b] = energy[b];
 			}
-		Sample e = 0;
-		for (int repeat = 0; repeat < 2; ++repeat) {
+			smoothEnergyState = 0;
+			return;
+		}
+
+		// The two other steps are repeated smoothing passes, down and up
+		Sample e = smoothEnergyState;
 		for (int b = bands - 1; b >= 0; --b) {
 			e += (smoothedEnergy[b] - e)*smoothingSlew;
 			smoothedEnergy[b] = e;
@ -665,7 +708,7 @@ private:
 			e += (smoothedEnergy[b] - e)*smoothingSlew;
 			smoothedEnergy[b] = e;
 		}
-		}
+		smoothEnergyState = e;
 	}
 	
 	Sample mapFreq(Sample freq) const {
@ -678,9 +721,7 @@ private:
 	}
 	
 	// Identifies spectral peaks using energy across all channels
-	void findPeaks(Sample smoothingBins) {
-		smoothEnergy(smoothingBins);
-
+	void findPeaks() {
 		peaks.resize(0);
 		
 		int start = 0;