From 70e4b9c4c97a7078e10b9b85020f317afac43040 Mon Sep 17 00:00:00 2001 From: Geraint Luff Date: Wed, 4 Dec 2024 17:29:01 +0000 Subject: [PATCH] Formant shift based on (rough1) freq estimation --- cmd/Makefile | 11 +++- cmd/main.cpp | 4 ++ signalsmith-stretch.h | 126 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 136 insertions(+), 5 deletions(-) diff --git a/cmd/Makefile b/cmd/Makefile index 25d0560..e4c5e84 100644 --- a/cmd/Makefile +++ b/cmd/Makefile @@ -25,6 +25,13 @@ examples: out/stretch inputs/run-all.sh out/examples/t2- out/stretch --time=2 inputs/run-all.sh out/examples/t4- out/stretch --time=4 +TEST_WAV ?= "inputs/voice.wav" + +dev: out/stretch + out/stretch --time=0.8 --semitones=12 $(TEST_WAV) out/shift.wav + out/stretch --time=0.8 --semitones=12 --formant-comp $(TEST_WAV) out/shift-fc.wav + out/stretch --time=0.8 --semitones=12 --formant-comp --formant=3 $(TEST_WAV) out/shift-fc-f3.wav + clean: rm -rf out @@ -32,5 +39,5 @@ clean: cmake: # CMAKE_BUILD_TYPE is needed for single-config generators (e.g. Makefiles) - cmake -B build -DCMAKE_BUILD_TYPE=Release - cmake --build build --config Release \ No newline at end of file + cmake -B out/build -DCMAKE_BUILD_TYPE=Release + cmake --build out/build --config Release \ No newline at end of file diff --git a/cmd/main.cpp b/cmd/main.cpp index 587b73f..47dad4d 100644 --- a/cmd/main.cpp +++ b/cmd/main.cpp @@ -74,6 +74,8 @@ int main(int argc, char* argv[]) { std::string outputWav = args.arg("output.wav", "output WAV file"); double semitones = args.flag("semitones", "pitch-shift amount", 0); + double formants = args.flag("formant", "formant-shift amount (semitones)", 0); + bool formantComp = args.hasFlag("formant-comp", "formant compensation"); double tonality = args.flag("tonality", "tonality limit (Hz)", 8000); double time = args.flag("time", "time-stretch factor", 1); bool exactLength = args.hasFlag("exact", "trims the start/end so the output has the correct length"); @@ -86,6 +88,7 @@ int main(int argc, char* argv[]) { std::cout << "\tsemitones: " << semitones << "\n\t time: " << time << "x" << (exactLength ? " (exact)" : "") << "\n\t tonality: " << tonality << "Hz\n"; Wav inWav; + std::cout << inputWav << " -> " << outputWav << "\n"; if (!inWav.read(inputWav).warn()) args.errorExit("failed to read WAV"); size_t inputLength = inWav.samples.size()/inWav.channels; @@ -108,6 +111,7 @@ int main(int argc, char* argv[]) { stopwatch.start(); stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation); stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate); + stretch.setFormantSemitones(formants, formantComp); double initSeconds = stopwatch.lap(); initMemory = initMemory.diff(); diff --git a/signalsmith-stretch.h b/signalsmith-stretch.h index 5a538b4..52af7f2 100644 --- a/signalsmith-stretch.h +++ b/signalsmith-stretch.h @@ -2,7 +2,9 @@ #define SIGNALSMITH_STRETCH_H #include "signalsmith-linear/stft.h" // https://github.com/Signalsmith-Audio/linear + #include +#include #include #include #include @@ -34,7 +36,7 @@ struct SignalsmithStretch { SignalsmithStretch() : randomEngine(std::random_device{}()) {} SignalsmithStretch(long seed) : randomEngine(seed) {} - + int blockSamples() const { return int(stft.blockSamples()); } @@ -57,8 +59,8 @@ struct SignalsmithStretch { channelBands.assign(channelBands.size(), Band()); silenceCounter = 0; didSeek = false; - blockProcess = {}; + freqEstimateWeighted = freqEstimateWeight = 0; } // Configures using a default preset @@ -90,6 +92,7 @@ struct SignalsmithStretch { channelPredictions.resize(channels*bands); blockProcess = {}; + formantMetric.resize(bands + 2); } /// Frequency multiplier, and optional tonality limit (as multiple of sample-rate) @@ -104,13 +107,20 @@ struct SignalsmithStretch { } void setTransposeSemitones(Sample semitones, Sample tonalityLimit=0) { setTransposeFactor(std::pow(2, semitones/12), tonalityLimit); - customFreqMap = nullptr; } // Sets a custom frequency map - should be monotonically increasing void setFreqMap(std::function inputToOutput) { customFreqMap = inputToOutput; } + void setFormantFactor(Sample multiplier, bool compensatePitch=false) { + formantMultiplier = multiplier; + formantCompensation = compensatePitch; + } + void setFormantSemitones(Sample semitones, bool compensatePitch=false) { + setFormantFactor(std::pow(2, semitones/12), compensatePitch); + } + // Provide previous input ("pre-roll"), without affecting the speed calculation. You should ideally feed it one block-length + one interval template void seek(Inputs &&inputs, int inputSamples, double playbackRate) { @@ -240,6 +250,9 @@ struct SignalsmithStretch { // analyse a new input blockProcess.steps += stft.analyseSteps() + 1; } + + blockProcess.processFormants = formantMultiplier != 1 || (formantCompensation && blockProcess.mappedFrequencies); + if (blockProcess.processFormants) ++blockProcess.steps; blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max(1, inputInterval); didSeek = false; @@ -394,6 +407,7 @@ private: bool newSpectrum = false; bool reanalysePrev = false; bool mappedFrequencies = false; + bool processFormants = false; Sample timeFactor; } blockProcess; @@ -405,6 +419,9 @@ private: Sample freqMultiplier = 1, freqTonalityLimit = 0.5; std::function customFreqMap = nullptr; + + bool formantCompensation = false; // compensate for pitch/freq change + Sample formantMultiplier = 1; using STFT = signalsmith::linear::DynamicSTFT; STFT stft; @@ -568,12 +585,20 @@ private: bins[b].inputEnergy = _impl::norm(bins[b].input); } } + for (int b = 0; b < bands; ++b) { outputMap[b] = {Sample(b), 1}; } } return; } + if (blockProcess.processFormants) { + if (step-- == 0) { + updateFormants(0); + return; + } + } + // Preliminary output prediction from phase-vocoder if (step < size_t(channels)) { int c = int(step); Band *bins = bandsForChannel(c); @@ -796,6 +821,101 @@ private: outputMap[b] = {b + topOffset, 1}; } } + + Sample freqEstimateWeighted = 0; + Sample freqEstimateWeight = 0; + + std::vector formantMetric; + void updateFormants(size_t) { + for (auto &e : formantMetric) e = 0; + for (int c = 0; c < channels; ++c) { + Band *bins = bandsForChannel(c); + for (int b = 0; b < bands; ++b) { + formantMetric[b] += bins[b].inputEnergy; + } + } + + // 3 highest peaks in the input + std::array peakIndices{0, 0, 0}; + for (int b = 1; b < bands - 1; ++b) { + Sample e = formantMetric[b]; + // local maxima only + if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue; + + if (e > formantMetric[peakIndices[0]]) { + if (e > formantMetric[peakIndices[1]]) { + if (e > formantMetric[peakIndices[2]]) { + peakIndices = {peakIndices[1], peakIndices[2], b}; + } else { + peakIndices = {peakIndices[1], b, peakIndices[2]}; + } + } else { + peakIndices[0] = b; + } + } + } + + // VERY rough pitch estimation + int peakEstimate = peakIndices[2]; + if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) { + int diff = std::abs(peakEstimate - peakIndices[1]); + if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff; + if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) { + int diff = std::abs(peakEstimate - peakIndices[0]); + if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff; + } + } + Sample weight = formantMetric[peakIndices[2]]; + // Smooth it out a bit + freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25; + freqEstimateWeight += (weight - freqEstimateWeight)*0.25; + Sample freqEstimate = freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30)); + + for (int b = 0; b < bands; ++b) { + formantMetric[b] = std::sqrt(std::sqrt(formantMetric[b])); + } + Sample slew = 1/(freqEstimate*0.5 + 1); + Sample e = 0; + for (int repeat = 0; repeat < 1; ++repeat) { + for (int b = bands - 1; b >= 0; --b) { + e += (formantMetric[b] - e)*slew; + formantMetric[b] = e; + } + for (int b = 0; b < bands; ++b) { + e += (formantMetric[b] - e)*slew; + formantMetric[b] = e; + } + } + + auto getFormant = [&](Sample band) -> Sample { + if (band < 0) return 0; + band = std::min(band, bands); + int floorBand = std::floor(band); + Sample fracBand = band - floorBand; + Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1]; + return low + (high - low)*fracBand; + }; + + Sample formantMultiplierInv = 1/formantMultiplier; + + for (int b = 0; b < bands; ++b) { + Sample inputF = bandToFreq(b); + Sample outputF = formantCompensation ? mapFreq(inputF) : inputF; + outputF *= formantMultiplierInv; + + Sample inputE = formantMetric[b]; + Sample targetE = getFormant(freqToBand(outputF)); + + Sample formantRatio = targetE/(inputE + Sample(1e-30)); + Sample energyRatio = (formantRatio*formantRatio)*(formantRatio*formantRatio); + + for (int c = 0; c < channels; ++c) { + Band *bins = bandsForChannel(c); + // This is what's used to decide the output energy, so this affects the output + bins[b].inputEnergy *= energyRatio; + } + } + } }; }} // namespace