From 70e4b9c4c97a7078e10b9b85020f317afac43040 Mon Sep 17 00:00:00 2001
From: Geraint Luff <geraint@signalsmith-audio.co.uk>
Date: Wed, 4 Dec 2024 17:29:01 +0000
Subject: [PATCH] Formant shift based on (rough) freq estimation

---
 cmd/Makefile          |  11 +++-
 cmd/main.cpp          |   4 ++
 signalsmith-stretch.h | 126 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 5 deletions(-)
diff --git a/cmd/Makefile b/cmd/Makefile
index 25d0560..e4c5e84 100644
--- a/cmd/Makefile
+++ b/cmd/Makefile
@@ -25,6 +25,13 @@ examples: out/stretch
 	inputs/run-all.sh out/examples/t2- out/stretch --time=2
 	inputs/run-all.sh out/examples/t4- out/stretch --time=4
 
+TEST_WAV ?= "inputs/voice.wav"
+
+dev: out/stretch # renders $(TEST_WAV) three ways: plain shift, with --formant-comp, and with --formant-comp --formant=3
+	out/stretch --time=0.8 --semitones=12 $(TEST_WAV) out/shift.wav
+	out/stretch --time=0.8 --semitones=12 --formant-comp $(TEST_WAV) out/shift-fc.wav
+	out/stretch --time=0.8 --semitones=12 --formant-comp --formant=3 $(TEST_WAV) out/shift-fc-f3.wav
+
 clean:
 	rm -rf out
 
@@ -32,5 +39,5 @@ clean:
 
 cmake:
 	# CMAKE_BUILD_TYPE is needed for single-config generators (e.g. Makefiles)
-	cmake -B build -DCMAKE_BUILD_TYPE=Release
-	cmake --build build --config Release
\ No newline at end of file
+	cmake -B out/build -DCMAKE_BUILD_TYPE=Release
+	cmake --build out/build --config Release
\ No newline at end of file
diff --git a/cmd/main.cpp b/cmd/main.cpp
index 587b73f..47dad4d 100644
--- a/cmd/main.cpp
+++ b/cmd/main.cpp
@@ -74,6 +74,8 @@ int main(int argc, char* argv[]) {
 	std::string outputWav = args.arg<std::string>("output.wav", "output WAV file");
 	
 	double semitones = args.flag<double>("semitones", "pitch-shift amount", 0);
+	double formants = args.flag<double>("formant", "formant-shift amount (semitones)", 0);
+	bool formantComp = args.hasFlag("formant-comp", "formant compensation");
 	double tonality = args.flag<double>("tonality", "tonality limit (Hz)", 8000);
 	double time = args.flag<double>("time", "time-stretch factor", 1);
 	bool exactLength = args.hasFlag("exact", "trims the start/end so the output has the correct length");
@@ -86,6 +88,7 @@ int main(int argc, char* argv[]) {
 	std::cout << "\tsemitones: " << semitones << "\n\t     time: " << time << "x" << (exactLength ? " (exact)" : "") << "\n\t tonality: " << tonality << "Hz\n";
 
 	Wav inWav;
+	std::cout << inputWav << " -> " << outputWav << "\n";
 	if (!inWav.read(inputWav).warn()) args.errorExit("failed to read WAV");
 	size_t inputLength = inWav.samples.size()/inWav.channels;
 	
@@ -108,6 +111,7 @@ int main(int argc, char* argv[]) {
 	stopwatch.start();
 	stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation);
 	stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
+	stretch.setFormantSemitones(formants, formantComp);
 	double initSeconds = stopwatch.lap();
 
 	initMemory = initMemory.diff();
diff --git a/signalsmith-stretch.h b/signalsmith-stretch.h
index 5a538b4..52af7f2 100644
--- a/signalsmith-stretch.h
+++ b/signalsmith-stretch.h
@@ -2,7 +2,9 @@
 #define SIGNALSMITH_STRETCH_H
 
 #include "signalsmith-linear/stft.h" // https://github.com/Signalsmith-Audio/linear
+
 #include <vector>
+#include <array>
 #include <algorithm>
 #include <functional>
 #include <random>
@@ -34,7 +36,7 @@ struct SignalsmithStretch {
 
 	SignalsmithStretch() : randomEngine(std::random_device{}()) {}
 	SignalsmithStretch(long seed) : randomEngine(seed) {}
-
+	
 	int blockSamples() const {
 		return int(stft.blockSamples());
 	}
@@ -57,8 +59,8 @@ struct SignalsmithStretch {
 		channelBands.assign(channelBands.size(), Band());
 		silenceCounter = 0;
 		didSeek = false;
-
 		blockProcess = {};
+		freqEstimateWeighted = freqEstimateWeight = 0;
 	}
 
 	// Configures using a default preset
@@ -90,6 +92,7 @@ struct SignalsmithStretch {
 		channelPredictions.resize(channels*bands);
 
 		blockProcess = {};
+		formantMetric.resize(bands + 2);
 	}
 
 	/// Frequency multiplier, and optional tonality limit (as multiple of sample-rate)
@@ -104,13 +107,20 @@ struct SignalsmithStretch {
 	}
 	void setTransposeSemitones(Sample semitones, Sample tonalityLimit=0) {
 		setTransposeFactor(std::pow(2, semitones/12), tonalityLimit);
-		customFreqMap = nullptr;
 	}
 	// Sets a custom frequency map - should be monotonically increasing
 	void setFreqMap(std::function<Sample(Sample)> inputToOutput) {
 		customFreqMap = inputToOutput;
 	}
 
+	void setFormantFactor(Sample multiplier, bool compensatePitch=false) { // Formant shift as a frequency ratio (1 = no shift)
+		formantMultiplier = multiplier; // formant processing is skipped while this is 1 and compensation does not apply
+		formantCompensation = compensatePitch; // if set, formant targets are also passed through the pitch/frequency map
+	}
+	void setFormantSemitones(Sample semitones, bool compensatePitch=false) { // Same as setFormantFactor(), with the shift given in semitones
+		setFormantFactor(std::pow(2, semitones/12), compensatePitch); // 12 semitones per factor of 2
+	}
+
 	// Provide previous input ("pre-roll"), without affecting the speed calculation.  You should ideally feed it one block-length + one interval
 	template<class Inputs>
 	void seek(Inputs &&inputs, int inputSamples, double playbackRate) {
@@ -240,6 +250,9 @@ struct SignalsmithStretch {
 					// analyse a new input
 					blockProcess.steps += stft.analyseSteps() + 1;
 				}
+				
+				blockProcess.processFormants = formantMultiplier != 1 || (formantCompensation && blockProcess.mappedFrequencies);
+				if (blockProcess.processFormants) ++blockProcess.steps;
 
 				blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
 				didSeek = false;
@@ -394,6 +407,7 @@ private:
 		bool newSpectrum = false;
 		bool reanalysePrev = false;
 		bool mappedFrequencies = false;
+		bool processFormants = false;
 		Sample timeFactor;
 	} blockProcess;
 
@@ -405,6 +419,9 @@ private:
 
 	Sample freqMultiplier = 1, freqTonalityLimit = 0.5;
 	std::function<Sample(Sample)> customFreqMap = nullptr;
+	
+	bool formantCompensation = false; // compensate for pitch/freq change
+	Sample formantMultiplier = 1;
 
 	using STFT = signalsmith::linear::DynamicSTFT<Sample, false, true>;
 	STFT stft;
@@ -568,12 +585,20 @@ private:
 						bins[b].inputEnergy = _impl::norm(bins[b].input);
 					}
 				}
+
 				for (int b = 0; b < bands; ++b) {
 					outputMap[b] = {Sample(b), 1};
 				}
 			}
 			return;
 		}
+		if (blockProcess.processFormants) {
+			if (step-- == 0) {
+				updateFormants(0);
+				return;
+			}
+		}
+		// Preliminary output prediction from phase-vocoder
 		if (step < size_t(channels)) {
 			int c = int(step);
 			Band *bins = bandsForChannel(c);
@@ -796,6 +821,101 @@ private:
 			outputMap[b] = {b + topOffset, 1};
 		}
 	}
+	
+	Sample freqEstimateWeighted = 0; // smoothed (estimated peak band * peak energy), updated by updateFormants()
+	Sample freqEstimateWeight = 0; // smoothed peak energy; the pitch estimate is the ratio of these two
+	
+	std::vector<Sample> formantMetric; // per-band working buffer for updateFormants(); sized to bands + 2 when the band count is set
+	void updateFormants(size_t) { // Rescales every band's inputEnergy from a smoothed, frequency-shifted copy of the summed spectrum (argument unused)
+		std::fill(formantMetric.begin(), formantMetric.end(), Sample(0));
+		for (int c = 0; c < channels; ++c) {
+			Band *bins = bandsForChannel(c);
+			for (int b = 0; b < bands; ++b) {
+				formantMetric[b] += bins[b].inputEnergy; // energy per band, summed across channels
+			}
+		}
+		
+		// 3 highest peaks in the input, kept in ascending order of energy (peakIndices[2] is the strongest)
+		std::array<int, 3> peakIndices{0, 0, 0};
+		for (int b = 1; b < bands - 1; ++b) {
+			Sample e = formantMetric[b];
+			// local maxima only
+			if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue;
+			
+			if (e > formantMetric[peakIndices[0]]) {
+				if (e > formantMetric[peakIndices[1]]) {
+					if (e > formantMetric[peakIndices[2]]) {
+						peakIndices = {peakIndices[1], peakIndices[2], b};
+					} else {
+						peakIndices = {peakIndices[1], b, peakIndices[2]};
+					}
+				} else {
+					peakIndices[0] = b;
+				}
+			}
+		}
+		
+		// VERY rough pitch estimation: start at the strongest peak, then fold it down by its spacing to the weaker peaks (presumably treating them as harmonics)
+		int peakEstimate = peakIndices[2];
+		if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) {
+			int diff = std::abs(peakEstimate - peakIndices[1]);
+			if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff; // diff > peakEstimate/8 >= 0 here, so never a modulo by zero
+			if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) {
+				int diffLowest = std::abs(peakEstimate - peakIndices[0]);
+				if (diffLowest > peakEstimate/8 && diffLowest < peakEstimate*7/8) peakEstimate = peakEstimate%diffLowest;
+			}
+		}
+		Sample weight = formantMetric[peakIndices[2]];
+		// Smooth it out a bit: one-pole averages across calls, weighted by the strongest peak's energy
+		freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25;
+		freqEstimateWeight += (weight - freqEstimateWeight)*0.25;
+		Sample freqEstimate = freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30)); // measured in bands; the epsilon avoids 0/0 when there is no energy
+	
+		for (int b = 0; b < bands; ++b) {
+			formantMetric[b] = std::sqrt(std::sqrt(formantMetric[b])); // 4th root of energy; the ratio below is raised back to the 4th power
+		}
+		Sample slew = 1/(freqEstimate*0.5 + 1); // higher estimated pitch -> smaller coefficient -> wider smoothing
+		Sample e = 0;
+		for (int repeat = 0; repeat < 1; ++repeat) {
+			for (int b = bands - 1; b >= 0; --b) { // downward pass, then upward pass, of the same one-pole smoother
+				e += (formantMetric[b] - e)*slew;
+				formantMetric[b] = e;
+			}
+			for (int b = 0; b < bands; ++b) {
+				e += (formantMetric[b] - e)*slew;
+				formantMetric[b] = e;
+			}
+		}
+		
+		auto getFormant = [&](Sample band) -> Sample { // linearly-interpolated lookup of the smoothed metric at a fractional band
+			if (!(band >= 0)) return 0; // negative or NaN: a plain `band < 0` test lets NaN through to the array index
+			band = std::min<Sample>(band, bands);
+			int floorBand = int(std::floor(band));
+			Sample fracBand = band - floorBand;
+			Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1]; // floorBand + 1 <= bands + 1, the last of the bands + 2 entries
+			return low + (high - low)*fracBand;
+		};
+		
+		Sample formantMultiplierInv = 1/formantMultiplier;
+
+		for (int b = 0; b < bands; ++b) {
+			Sample inputF = bandToFreq(b);
+			Sample outputF = formantCompensation ? mapFreq(inputF) : inputF;
+			outputF *= formantMultiplierInv;
+
+			Sample inputE = formantMetric[b];
+			Sample targetE = getFormant(freqToBand(outputF));
+
+			Sample formantRatio = targetE/(inputE + Sample(1e-30));
+			Sample energyRatio = (formantRatio*formantRatio)*(formantRatio*formantRatio);
+
+			for (int c = 0; c < channels; ++c) {
+				Band *bins = bandsForChannel(c);
+				// This is what's used to decide the output energy, so this affects the output
+				bins[b].inputEnergy *= energyRatio;
+			}
+		}
+	}
 };
 
 }} // namespace