Refactor: split formant processing into 3 computation steps

2025-04-18 21:03:15 +01:00 · 2025-04-18 21:03:15 +01:00 · b84e9cf5e9
commit b84e9cf5e9
parent 004a52b30d
3 changed files with 99 additions and 91 deletions
--- a/cmd/Makefile
+++ b/cmd/Makefile
@ -32,6 +32,7 @@ dev: out/stretch
 	out/stretch --time=0.8 --semitones=10 --formant-comp $(TEST_WAV) out/shift-fc.wav
 	out/stretch --time=0.8 --semitones=10 --formant-comp --formant=3 $(TEST_WAV) out/shift-fc-f3.wav
 	out/stretch --time=0.8 --semitones=10 --formant-comp --formant=3 --formant-base=500 $(TEST_WAV) out/shift-fc-f3-fb500.wav
+	out/stretch --time=0.8 --semitones=10 --formant-comp --formant=2 --formant-base=100 $(TEST_WAV) out/shift-fc-f2-fb100.wav

 clean:
 	rm -rf out
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@ -2,6 +2,7 @@
 #include <iostream>
 #define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << "\n";

+#define PROFILE_PLOT_CHUNKS
 #ifdef PROFILE_PLOT_CHUNKS
 size_t activeStepIndex = 0;
 void profileProcessStart(int, int);
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@ -257,7 +257,6 @@ struct SignalsmithStretch {
 				}
 				
 				blockProcess.processFormants = formantMultiplier != 1 || (formantCompensation && blockProcess.mappedFrequencies);
-				if (blockProcess.processFormants) ++blockProcess.steps;

 				blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
 				didSeek = false;
@ -540,6 +539,7 @@ private:
 		processSpectrumSteps += channels; // preliminary phase-vocoder prediction
 		processSpectrumSteps += splitMainPrediction;
 		if (blockProcess.newSpectrum) processSpectrumSteps += 1; // .input -> .prevInput
+		if (blockProcess.processFormants) processSpectrumSteps += 3;
 	}
 	void processSpectrum(size_t step) {
 		Sample timeFactor = blockProcess.timeFactor;
@ -598,10 +598,11 @@ private:
 			return;
 		}
 		if (blockProcess.processFormants) {
-			if (step-- == 0) {
-				updateFormants(0);
+			if (step < 3) {
+				updateFormants(step);
 				return;
 			}
+			step -= 3;
 		}
 		// Preliminary output prediction from phase-vocoder
 		if (step < size_t(channels)) {
@ -836,99 +837,104 @@ private:

 	Sample freqEstimateWeighted = 0;
 	Sample freqEstimateWeight = 0;
+	Sample estimateFrequency() { // very rough pitch estimate, as a (fractional) band index, from the largest local maxima of formantMetric
+		// Indices of the 3 highest peaks in the input, ordered so that [0] is the weakest and [2] the strongest (all start at band 0)
+		std::array<int, 3> peakIndices{0, 0, 0};
+		for (int b = 1; b < bands - 1; ++b) { // first and last band are skipped: both neighbours are compared below
+			Sample e = formantMetric[b];
+			// local maxima only (a tie with the lower neighbour is accepted, a tie with the upper one is not)
+			if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue;
+			
+			if (e > formantMetric[peakIndices[0]]) { // beats the weakest tracked peak, so it replaces one of the three
+				if (e > formantMetric[peakIndices[1]]) {
+					if (e > formantMetric[peakIndices[2]]) {
+						peakIndices = {peakIndices[1], peakIndices[2], b}; // new strongest: the others shift down, the old weakest is dropped
+					} else {
+						peakIndices = {peakIndices[1], b, peakIndices[2]}; // new middle: the old weakest is dropped
+					}
+				} else {
+					peakIndices[0] = b; // new weakest
+				}
+			}
+		}
+		
+		// VERY rough pitch estimation: start from the strongest peak, then reduce it modulo its distance to the other peaks
+		int peakEstimate = peakIndices[2];
+		if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) { // second peak is used only above 10% of the strongest
+			int diff = std::abs(peakEstimate - peakIndices[1]);
+			if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff; // the lower bound also rules out diff == 0
+			if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) { // third peak is used only above 1% of the strongest
+				int diff = std::abs(peakEstimate - peakIndices[0]); // shadows the outer diff; measured from the already-reduced estimate
+				if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
+			}
+		}
+		Sample weight = formantMetric[peakIndices[2]]; // the estimate is weighted by the strongest peak's metric value
+		// Smooth it out a bit: each accumulator moves 25% of the way towards its new value per call
+		freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25;
+		freqEstimateWeight += (weight - freqEstimateWeight)*0.25;
+		
+		return freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30)); // weighted average; the tiny offset avoids dividing by zero
+	}
+	
+	Sample freqEstimate = 0; // written in updateFormants() step 0, read in step 1; zero-initialised like freqEstimateWeighted/freqEstimateWeight so it is never read indeterminate
 	
 	std::vector<Sample> formantMetric;
 	Sample formantBaseFreq = 0;
-	void updateFormants(size_t) {
-		return;
-		for (auto &e : formantMetric) e = 0;
-		for (int c = 0; c < channels; ++c) {
-			Band *bins = bandsForChannel(c);
-			for (int b = 0; b < bands; ++b) {
-				formantMetric[b] += bins[b].inputEnergy;
-			}
-		}
-
-		Sample freqEstimate = freqToBand(formantBaseFreq);
-		if (formantBaseFreq <= 0) {
-			// 3 highest peaks in the input
-			std::array<int, 3> peakIndices{0, 0, 0};
-			for (int b = 1; b < bands - 1; ++b) {
-				Sample e = formantMetric[b];
-				// local maxima only
-				if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue;
-				
-				if (e > formantMetric[peakIndices[0]]) {
-					if (e > formantMetric[peakIndices[1]]) {
-						if (e > formantMetric[peakIndices[2]]) {
-							peakIndices = {peakIndices[1], peakIndices[2], b};
-						} else {
-							peakIndices = {peakIndices[1], b, peakIndices[2]};
-						}
-					} else {
-						peakIndices[0] = b;
-					}
-				}
-			}
-			
-			// VERY rough pitch estimation
-			int peakEstimate = peakIndices[2];
-			if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) {
-				int diff = std::abs(peakEstimate - peakIndices[1]);
-				if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
-				if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) {
-					int diff = std::abs(peakEstimate - peakIndices[0]);
-					if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
-				}
-			}
-			Sample weight = formantMetric[peakIndices[2]];
-			// Smooth it out a bit
-			freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25;
-			freqEstimateWeight += (weight - freqEstimateWeight)*0.25;
-			
-			freqEstimate = freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30));
-		}
-	
-		for (int b = 0; b < bands; ++b) {
-			formantMetric[b] = std::sqrt(std::sqrt(formantMetric[b]));
-		}
-		Sample slew = 1/(freqEstimate*0.71 + 1);
-		Sample e = 0;
-		for (int repeat = 0; repeat < 1; ++repeat) {
-			for (int b = bands - 1; b >= 0; --b) {
-				e += (formantMetric[b] - e)*slew;
-				formantMetric[b] = e;
-			}
-			for (int b = 0; b < bands; ++b) {
-				e += (formantMetric[b] - e)*slew;
-				formantMetric[b] = e;
-			}
-		}
-		
-		auto getFormant = [&](Sample band) -> Sample {
-			if (band < 0) return 0;
-			band = std::min<Sample>(band, bands);
-			int floorBand = std::floor(band);
-			Sample fracBand = band - floorBand;
-			Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1];
-			return low + (high - low)*fracBand;
-		};
-		
-		for (int b = 0; b < bands; ++b) {
-			Sample inputF = bandToFreq(b);
-			Sample outputF = formantCompensation ? mapFreq(inputF) : inputF;
-			outputF = invMapFormant(outputF);
-
-			Sample inputE = formantMetric[b];
-			Sample targetE = getFormant(freqToBand(outputF));
-
-			Sample formantRatio = targetE/(inputE + Sample(1e-30));
-			Sample energyRatio = (formantRatio*formantRatio)*(formantRatio*formantRatio);
-
+	void updateFormants(size_t step) { // runs one of 3 sequential steps; step 0 and 1 are matched explicitly, anything else takes the final branch
+		if (step-- == 0) { // step 0: sum input energy across channels, choose freqEstimate, then take the square root
+			for (auto &e : formantMetric) e = 0; // reset the accumulator
 			for (int c = 0; c < channels; ++c) {
 				Band *bins = bandsForChannel(c);
-				// This is what's used to decide the output energy, so this affects the output
-				bins[b].inputEnergy *= energyRatio;
+				for (int b = 0; b < bands; ++b) {
+					formantMetric[b] += bins[b].inputEnergy;
+				}
+			}
+
+			freqEstimate = freqToBand(formantBaseFreq); // configured base frequency, converted by freqToBand
+			if (formantBaseFreq <= 0) freqEstimate = estimateFrequency(); // no positive base frequency: estimate from the summed energy (before the square root below)
+
+			for (int b = 0; b < bands; ++b) {
+				formantMetric[b] = std::sqrt(formantMetric[b]); // the ratio computed in the last step is squared again to compensate
+			}
+		} else if (step-- == 0) { // step 1 (step was already decremented once by the test above): smooth the metric across bands
+			Sample slew = 1/(freqEstimate*0.5 + 1); // smoothing coefficient; a larger freqEstimate gives a smaller coefficient (more smoothing)
+			Sample e = 0; // running smoothed value, carried over between all passes
+			for (size_t repeat = 0; repeat < 2; ++repeat) { // two rounds, each a downward then an upward pass
+				for (int b = bands - 1; b >= 0; --b) { // downward pass
+					e += (formantMetric[b] - e)*slew;
+					formantMetric[b] = e;
+				}
+				for (int b = 0; b < bands; ++b) { // upward pass
+					e += (formantMetric[b] - e)*slew;
+					formantMetric[b] = e;
+				}
+			}
+		} else { // remaining step: scale each band's input energy by the ratio of the smoothed metric at the mapped frequency to the one at this band
+			auto getFormant = [&](Sample band) -> Sample { // smoothed metric at a fractional band position, linearly interpolated
+				if (band < 0) return 0;
+				band = std::min<Sample>(band, bands); // NOTE(review): band == bands reads formantMetric[bands] and [bands + 1] below - confirm the vector holds at least bands + 2 entries
+				int floorBand = std::floor(band);
+				Sample fracBand = band - floorBand;
+				Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1];
+				return low + (high - low)*fracBand;
+			};
+
+			for (int b = 0; b < bands; ++b) {
+				Sample inputF = bandToFreq(b);
+				Sample outputF = formantCompensation ? mapFreq(inputF) : inputF; // apply the frequency map only when formant compensation is on
+				outputF = invMapFormant(outputF);
+
+				Sample inputE = formantMetric[b];
+				Sample targetE = getFormant(freqToBand(outputF));
+
+				Sample formantRatio = targetE/(inputE + Sample(1e-30)); // the tiny offset avoids dividing by zero
+				Sample energyRatio = formantRatio*formantRatio; // the metric is a square root of energy, so squaring returns to energy units
+
+				for (int c = 0; c < channels; ++c) { // the same ratio is applied to every channel
+					Band *bins = bandsForChannel(c);
+					// This is what's used to decide the output energy, so this affects the output
+					bins[b].inputEnergy *= energyRatio;
+				}
 			}
 		}
 	}