Formant shift based on (rough1) freq estimation
This commit is contained in:
parent
54423c34db
commit
70e4b9c4c9
11
cmd/Makefile
11
cmd/Makefile
@ -25,6 +25,13 @@ examples: out/stretch
|
||||
inputs/run-all.sh out/examples/t2- out/stretch --time=2
|
||||
inputs/run-all.sh out/examples/t4- out/stretch --time=4
|
||||
|
||||
TEST_WAV ?= "inputs/voice.wav"
|
||||
|
||||
dev: out/stretch
|
||||
out/stretch --time=0.8 --semitones=12 $(TEST_WAV) out/shift.wav
|
||||
out/stretch --time=0.8 --semitones=12 --formant-comp $(TEST_WAV) out/shift-fc.wav
|
||||
out/stretch --time=0.8 --semitones=12 --formant-comp --formant=3 $(TEST_WAV) out/shift-fc-f3.wav
|
||||
|
||||
clean:
|
||||
rm -rf out
|
||||
|
||||
@ -32,5 +39,5 @@ clean:
|
||||
|
||||
cmake:
|
||||
# CMAKE_BUILD_TYPE is needed for single-config generators (e.g. Makefiles)
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake -B out/build -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build out/build --config Release
|
||||
@ -74,6 +74,8 @@ int main(int argc, char* argv[]) {
|
||||
std::string outputWav = args.arg<std::string>("output.wav", "output WAV file");
|
||||
|
||||
double semitones = args.flag<double>("semitones", "pitch-shift amount", 0);
|
||||
double formants = args.flag<double>("formant", "formant-shift amount (semitones)", 0);
|
||||
bool formantComp = args.hasFlag("formant-comp", "formant compensation");
|
||||
double tonality = args.flag<double>("tonality", "tonality limit (Hz)", 8000);
|
||||
double time = args.flag<double>("time", "time-stretch factor", 1);
|
||||
bool exactLength = args.hasFlag("exact", "trims the start/end so the output has the correct length");
|
||||
@ -86,6 +88,7 @@ int main(int argc, char* argv[]) {
|
||||
std::cout << "\tsemitones: " << semitones << "\n\t time: " << time << "x" << (exactLength ? " (exact)" : "") << "\n\t tonality: " << tonality << "Hz\n";
|
||||
|
||||
Wav inWav;
|
||||
std::cout << inputWav << " -> " << outputWav << "\n";
|
||||
if (!inWav.read(inputWav).warn()) args.errorExit("failed to read WAV");
|
||||
size_t inputLength = inWav.samples.size()/inWav.channels;
|
||||
|
||||
@ -108,6 +111,7 @@ int main(int argc, char* argv[]) {
|
||||
stopwatch.start();
|
||||
stretch.presetDefault(int(inWav.channels), inWav.sampleRate, splitComputation);
|
||||
stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
|
||||
stretch.setFormantSemitones(formants, formantComp);
|
||||
double initSeconds = stopwatch.lap();
|
||||
|
||||
initMemory = initMemory.diff();
|
||||
|
||||
@ -2,7 +2,9 @@
|
||||
#define SIGNALSMITH_STRETCH_H
|
||||
|
||||
#include "signalsmith-linear/stft.h" // https://github.com/Signalsmith-Audio/linear
|
||||
|
||||
#include <algorithm>
#include <array>
#include <functional>
#include <random>
#include <utility>
#include <vector>
|
||||
@ -34,7 +36,7 @@ struct SignalsmithStretch {
|
||||
|
||||
// Default constructor: seeds `randomEngine` with a value drawn from `std::random_device`
SignalsmithStretch() : randomEngine(std::random_device{}()) {}
|
||||
// Seeded constructor: initialises `randomEngine` with the caller-supplied seed
SignalsmithStretch(long seed) : randomEngine(seed) {}
|
||||
|
||||
|
||||
int blockSamples() const {
|
||||
return int(stft.blockSamples());
|
||||
}
|
||||
@ -57,8 +59,8 @@ struct SignalsmithStretch {
|
||||
channelBands.assign(channelBands.size(), Band());
|
||||
silenceCounter = 0;
|
||||
didSeek = false;
|
||||
|
||||
blockProcess = {};
|
||||
freqEstimateWeighted = freqEstimateWeight = 0;
|
||||
}
|
||||
|
||||
// Configures using a default preset
|
||||
@ -90,6 +92,7 @@ struct SignalsmithStretch {
|
||||
channelPredictions.resize(channels*bands);
|
||||
|
||||
blockProcess = {};
|
||||
formantMetric.resize(bands + 2);
|
||||
}
|
||||
|
||||
/// Frequency multiplier, and optional tonality limit (as multiple of sample-rate)
|
||||
@ -104,13 +107,20 @@ struct SignalsmithStretch {
|
||||
}
|
||||
/// Transposition expressed in semitones (12 per octave), with the same optional tonality limit
/// as `setTransposeFactor()`.  Also clears any custom frequency map.
void setTransposeSemitones(Sample semitones, Sample tonalityLimit=0) {
	const auto octaves = semitones/12;
	setTransposeFactor(std::pow(2, octaves), tonalityLimit);
	customFreqMap = nullptr;
}
|
||||
// Sets a custom frequency map - should be monotonically increasing
|
||||
void setFreqMap(std::function<Sample(Sample)> inputToOutput) {
|
||||
customFreqMap = inputToOutput;
|
||||
}
|
||||
|
||||
/// Sets the formant shift as a frequency multiplier (1 means no shift).
/// `compensatePitch` is stored as the formant-compensation flag (compensate for pitch/freq change).
/// Non-positive or NaN multipliers fall back to 1: the formant update divides by this value,
/// so zero/NaN would otherwise produce non-finite frequencies.
void setFormantFactor(Sample multiplier, bool compensatePitch=false) {
	// `multiplier > 0` is false for NaN as well as for zero/negative values
	formantMultiplier = (multiplier > 0) ? multiplier : Sample(1);
	formantCompensation = compensatePitch;
}
|
||||
/// Formant shift expressed in semitones (12 per octave); forwards to `setFormantFactor()`
void setFormantSemitones(Sample semitones, bool compensatePitch=false) {
	const auto octaves = semitones/12;
	const auto factor = std::pow(2, octaves);
	setFormantFactor(factor, compensatePitch);
}
|
||||
|
||||
// Provide previous input ("pre-roll"), without affecting the speed calculation. You should ideally feed it one block-length + one interval
|
||||
template<class Inputs>
|
||||
void seek(Inputs &&inputs, int inputSamples, double playbackRate) {
|
||||
@ -240,6 +250,9 @@ struct SignalsmithStretch {
|
||||
// analyse a new input
|
||||
blockProcess.steps += stft.analyseSteps() + 1;
|
||||
}
|
||||
|
||||
blockProcess.processFormants = formantMultiplier != 1 || (formantCompensation && blockProcess.mappedFrequencies);
|
||||
if (blockProcess.processFormants) ++blockProcess.steps;
|
||||
|
||||
blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
|
||||
didSeek = false;
|
||||
@ -394,6 +407,7 @@ private:
|
||||
bool newSpectrum = false;
|
||||
bool reanalysePrev = false;
|
||||
bool mappedFrequencies = false;
|
||||
bool processFormants = false;
|
||||
Sample timeFactor;
|
||||
} blockProcess;
|
||||
|
||||
@ -405,6 +419,9 @@ private:
|
||||
|
||||
Sample freqMultiplier = 1, freqTonalityLimit = 0.5;
|
||||
std::function<Sample(Sample)> customFreqMap = nullptr;
|
||||
|
||||
bool formantCompensation = false; // compensate for pitch/freq change
|
||||
Sample formantMultiplier = 1;
|
||||
|
||||
using STFT = signalsmith::linear::DynamicSTFT<Sample, false, true>;
|
||||
STFT stft;
|
||||
@ -568,12 +585,20 @@ private:
|
||||
bins[b].inputEnergy = _impl::norm(bins[b].input);
|
||||
}
|
||||
}
|
||||
|
||||
for (int b = 0; b < bands; ++b) {
|
||||
outputMap[b] = {Sample(b), 1};
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (blockProcess.processFormants) {
|
||||
if (step-- == 0) {
|
||||
updateFormants(0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Preliminary output prediction from phase-vocoder
|
||||
if (step < size_t(channels)) {
|
||||
int c = int(step);
|
||||
Band *bins = bandsForChannel(c);
|
||||
@ -796,6 +821,101 @@ private:
|
||||
outputMap[b] = {b + topOffset, 1};
|
||||
}
|
||||
}
|
||||
|
||||
Sample freqEstimateWeighted = 0;
|
||||
Sample freqEstimateWeight = 0;
|
||||
|
||||
std::vector<Sample> formantMetric;
|
||||
void updateFormants(size_t) {
|
||||
for (auto &e : formantMetric) e = 0;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Band *bins = bandsForChannel(c);
|
||||
for (int b = 0; b < bands; ++b) {
|
||||
formantMetric[b] += bins[b].inputEnergy;
|
||||
}
|
||||
}
|
||||
|
||||
// 3 highest peaks in the input
|
||||
std::array<int, 3> peakIndices{0, 0, 0};
|
||||
for (int b = 1; b < bands - 1; ++b) {
|
||||
Sample e = formantMetric[b];
|
||||
// local maxima only
|
||||
if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue;
|
||||
|
||||
if (e > formantMetric[peakIndices[0]]) {
|
||||
if (e > formantMetric[peakIndices[1]]) {
|
||||
if (e > formantMetric[peakIndices[2]]) {
|
||||
peakIndices = {peakIndices[1], peakIndices[2], b};
|
||||
} else {
|
||||
peakIndices = {peakIndices[1], b, peakIndices[2]};
|
||||
}
|
||||
} else {
|
||||
peakIndices[0] = b;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// VERY rough pitch estimation
|
||||
int peakEstimate = peakIndices[2];
|
||||
if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) {
|
||||
int diff = std::abs(peakEstimate - peakIndices[1]);
|
||||
if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
|
||||
if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) {
|
||||
int diff = std::abs(peakEstimate - peakIndices[0]);
|
||||
if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
|
||||
}
|
||||
}
|
||||
Sample weight = formantMetric[peakIndices[2]];
|
||||
// Smooth it out a bit
|
||||
freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25;
|
||||
freqEstimateWeight += (weight - freqEstimateWeight)*0.25;
|
||||
Sample freqEstimate = freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30));
|
||||
|
||||
for (int b = 0; b < bands; ++b) {
|
||||
formantMetric[b] = std::sqrt(std::sqrt(formantMetric[b]));
|
||||
}
|
||||
Sample slew = 1/(freqEstimate*0.5 + 1);
|
||||
Sample e = 0;
|
||||
for (int repeat = 0; repeat < 1; ++repeat) {
|
||||
for (int b = bands - 1; b >= 0; --b) {
|
||||
e += (formantMetric[b] - e)*slew;
|
||||
formantMetric[b] = e;
|
||||
}
|
||||
for (int b = 0; b < bands; ++b) {
|
||||
e += (formantMetric[b] - e)*slew;
|
||||
formantMetric[b] = e;
|
||||
}
|
||||
}
|
||||
|
||||
auto getFormant = [&](Sample band) -> Sample {
|
||||
if (band < 0) return 0;
|
||||
band = std::min<Sample>(band, bands);
|
||||
int floorBand = std::floor(band);
|
||||
Sample fracBand = band - floorBand;
|
||||
Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1];
|
||||
return low + (high - low)*fracBand;
|
||||
};
|
||||
|
||||
Sample formantMultiplierInv = 1/formantMultiplier;
|
||||
|
||||
for (int b = 0; b < bands; ++b) {
|
||||
Sample inputF = bandToFreq(b);
|
||||
Sample outputF = formantCompensation ? mapFreq(inputF) : inputF;
|
||||
outputF *= formantMultiplierInv;
|
||||
|
||||
Sample inputE = formantMetric[b];
|
||||
Sample targetE = getFormant(freqToBand(outputF));
|
||||
|
||||
Sample formantRatio = targetE/(inputE + Sample(1e-30));
|
||||
Sample energyRatio = (formantRatio*formantRatio)*(formantRatio*formantRatio);
|
||||
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
Band *bins = bandsForChannel(c);
|
||||
// This is what's used to decide the output energy, so this affects the output
|
||||
bins[b].inputEnergy *= energyRatio;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
}} // namespace
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user