Split .processSpectrum() into more steps

This commit is contained in:
Geraint 2025-02-21 14:47:56 +00:00
parent f72c4f0985
commit 46d866e9fe
5 changed files with 338 additions and 190 deletions

View File

@ -7,7 +7,7 @@ include(FetchContent)
FetchContent_Declare(
signalsmith-linear
GIT_REPOSITORY https://github.com/Signalsmith-Audio/linear.git
GIT_TAG c600e0420d260469566c41e1ccb64f89ee439dd3
GIT_TAG 0.1.0
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(signalsmith-linear)

View File

@ -2,6 +2,16 @@
#include <iostream>
#define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << "\n";
size_t activeStepIndex = 0;
void profileProcessStart(int, int);
void profileProcessEndStep();
void profileProcessStep(size_t, size_t);
void profileProcessEnd();
#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_START profileProcessStart
#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP profileProcessStep
#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP profileProcessEndStep
#define SIGNALSMITH_STRETCH_PROFILE_PROCESS_END profileProcessEnd
#include "signalsmith-stretch/signalsmith-stretch.h"
#include "./util/stopwatch.h"
@ -9,9 +19,44 @@
#include "./util/simple-args.h"
#include "./util/wav.h"
#include "plot/plot.h"
std::vector<signalsmith::Stopwatch> processStopwatches;
signalsmith::Stopwatch processStopwatchStart, processStopwatchEnd;
bool started = false;
bool activeStep = false;
void profileProcessStart(int /*inputSamples*/, int /*outputSamples*/) {
activeStep = false;
started = true;
processStopwatchStart.startLap();
}
void profileProcessEndStep() {
if (activeStep) {
activeStep = false;
processStopwatches[activeStepIndex].lap();
} else if (started) {
started = false;
processStopwatchStart.lap();
}
processStopwatchEnd.startLap();
}
void profileProcessStep(size_t step, size_t count) {
profileProcessEndStep();
activeStep = true;
activeStepIndex = step;
if (processStopwatches.size() < count) {
processStopwatches.resize(count);
}
processStopwatches[step].startLap();
}
void profileProcessEnd() {
processStopwatchEnd.lap();
}
int main(int argc, char* argv[]) {
signalsmith::stretch::SignalsmithStretch<float/*, std::ranlux48_base*/> stretch; // optional cheaper RNG for performance comparison
processStopwatches.reserve(1000);
SimpleArgs args(argc, argv);
if (args.hasFlag("v", "prints the version")) {
@ -56,7 +101,7 @@ int main(int argc, char* argv[]) {
stopwatch.start();
stretch.presetDefault(inWav.channels, inWav.sampleRate);
stretch.setTransposeSemitones(semitones, tonality/inWav.sampleRate);
double initSeconds = stopwatch.seconds(stopwatch.lap());
double initSeconds = stopwatch.lap();
initMemory = initMemory.diff();
std::cout << "Setup:\n\t" << initSeconds << "s\n";
@ -85,7 +130,7 @@ int main(int argc, char* argv[]) {
stretch.flush(outWav, tailSamples);
outWav.offset -= outputLength;
double processSeconds = stopwatch.seconds(stopwatch.lap());
double processSeconds = stopwatch.lap();
double processRate = (inWav.length()/inWav.sampleRate)/processSeconds;
double processPercent = 100/processRate;
processMemory = processMemory.diff();
@ -109,6 +154,31 @@ int main(int argc, char* argv[]) {
// the `.flush()` call already handled foldback stuff at the end (since we asked for a shorter `tailSamples`)
}
signalsmith::plot::Plot2D plot(400, 150);
plot.x.major(0, "").label("step");
plot.y.major(0).label("time spent");
auto &line = plot.line().fillToY(0);
auto &extraLine = plot.line().fillToY(0);
for (size_t i = 0; i < processStopwatches.size(); ++i) {
double time = processStopwatches[i].total();
if (i%5 == 0) {
plot.x.tick(i + 0.5, std::to_string(i));
} else {
plot.x.tick(i + 0.5, "");
}
line.add(i, time);
line.add(i + 1, time);
}
extraLine.add(-1, 0);
extraLine.add(-1, processStopwatchStart.total());
extraLine.add(0, processStopwatchStart.total());
extraLine.add(0, 0);
extraLine.add(processStopwatches.size(), 0);
extraLine.add(processStopwatches.size(), processStopwatchEnd.total());
extraLine.add(processStopwatches.size() + 1, processStopwatchEnd.total());
extraLine.add(processStopwatches.size() + 1, 0);
plot.write("profile.svg");
if (!outWav.write(outputWav).warn()) args.errorExit("failed to write WAV");
if (compareReference && prevWav.result) {

34
cmd/util/stop-denormals.h Normal file
View File

@ -0,0 +1,34 @@
#pragma once
#if defined(__SSE__) || defined(_M_X64)
class StopDenormals {
unsigned int controlStatusRegister;
public:
StopDenormals() : controlStatusRegister(_mm_getcsr()) {
_mm_setcsr(controlStatusRegister|0x8040); // Flush-to-Zero and Denormals-Are-Zero
}
~StopDenormals() {
_mm_setcsr(controlStatusRegister);
}
};
#elif (defined (__ARM_NEON) || defined (__ARM_NEON__))
class StopDenormals {
uintptr_t status;
public:
StopDenormals() {
uintptr_t asmStatus;
asm volatile("mrs %0, fpcr" : "=r"(asmStatus));
status = asmStatus = asmStatus|0x01000000U; // Flush to Zero
asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
}
~StopDenormals() {
uintptr_t asmStatus = status;
asm volatile("msr fpcr, %0" : : "ri"(asmStatus));
}
};
#else
# if __cplusplus >= 202302L
# warning "The `StopDenormals` class doesn't do anything for this architecture"
# endif
class StopDenormals {}; // FIXME: add for other architectures
#endif

View File

@ -6,37 +6,40 @@
#include <atomic>
#include <algorithm>
// We want CPU time, not wall-clock time, so we can't use `std::chrono::high_resolution_clock`
#ifdef WINDOWS
#ifdef WINDOWS // completely untested!
# include <windows.h>
namespace signalsmith {
class Stopwatch {
using Time = __int64;
using Duration = Time;
inline Time now() {
LARGE_INTEGER result;
QueryPerformanceCounter(&result);
return result.QuadPart;
}
static double timeToSeconds(double t) {
static double toSeconds(Duration t) {
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
return t/double(freq);
}
#else
# include <ctime>
# include <chrono>
namespace signalsmith {
class Stopwatch {
using Time = std::clock_t;
using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
using Time = Clock::time_point;
using Duration = std::chrono::duration<double>;
inline Time now() {
return std::clock();
return Clock::now();
}
static double timeToSeconds(double t) {
return t/double(CLOCKS_PER_SEC);
static double toSeconds(Duration duration) {
return duration.count();
}
#endif
std::atomic<Time> lapStart; // the atomic store/load should act as barriers for reordering operations
Time lapBest, lapTotal, lapTotal2;
double lapBest, lapTotal, lapTotal2;
double lapOverhead = 0;
int lapCount = 0;
@ -53,23 +56,22 @@ public:
}
start();
}
static double seconds(double time) {
return timeToSeconds(time);
// Explicit because std::atomic<> can't be copied/moved
Stopwatch(const Stopwatch &other) : lapBest(other.lapBest), lapTotal(other.lapTotal), lapTotal2(other.lapTotal2), lapOverhead(other.lapOverhead), lapCount(other.lapCount) {
lapStart.store(other.lapStart.load());
}
void start() {
lapCount = 0;
lapTotal = lapTotal2 = 0;
lapBest = std::numeric_limits<Time>::max();
lapBest = std::numeric_limits<double>::max();
startLap();
}
void startLap() {
lapStart.store(now());
}
double lap() {
auto start = lapStart.load();
auto diff = now() - start;
double diff = toSeconds(now() - lapStart.load());
if (diff < lapBest) lapBest = diff;
lapCount++;
@ -100,5 +102,6 @@ public:
}
};
} // namespace
} //namespace
#endif // include guard

View File

@ -140,8 +140,12 @@ struct SignalsmithStretch {
template<class Inputs, class Outputs>
void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_START
SIGNALSMITH_STRETCH_PROFILE_PROCESS_START(inputSamples, outputSamples);
#endif
int prevCopiedInput = 0;
auto copyInput = [&](int toIndex){
int length = std::min<int>(stft.blockSamples() + stft.defaultInterval(), toIndex - prevCopiedInput);
tmpBuffer.resize(length);
int offset = toIndex - length;
@ -164,6 +168,7 @@ struct SignalsmithStretch {
totalEnergy += s*s;
}
}
if (totalEnergy < noiseFloor) {
if (silenceCounter >= 2*stft.blockSamples()) {
if (silenceFirst) { // first block of silence processing
@ -209,6 +214,9 @@ struct SignalsmithStretch {
size_t processToStep = std::min<size_t>(blockProcess.steps, blockProcess.steps*processRatio);
while (blockProcess.step < processToStep) {
size_t step = blockProcess.step++;
#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP
SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP(step, blockProcess.steps);
#endif
if (blockProcess.newSpectrum) {
if (blockProcess.reanalysePrev) {
@ -237,7 +245,7 @@ struct SignalsmithStretch {
// Analyse latest (stashed) input
if (step < stft.analyseSteps()) {
stashedInput.swap(stft.input);
stft.analyse();
stft.analyseStep(step);
stashedInput.swap(stft.input);
continue;
}
@ -257,7 +265,7 @@ struct SignalsmithStretch {
}
if (step < processSpectrumSteps) {
processSpectrum(step, blockProcess.newSpectrum, blockProcess.timeFactor);
processSpectrum(step);
continue;
}
step -= processSpectrumSteps;
@ -279,10 +287,7 @@ struct SignalsmithStretch {
stft.synthesiseStep(step);
continue;
}
LOG_EXPR("uh oh");
LOG_EXPR(processToStep);
LOG_EXPR(blockProcess.steps);
LOG_EXPR(blockProcess.step);
// This should never happen - something has gone terribly wrong
abort();
}
if (processRatio >= 1) { // we *should* have just written a block, and are now ready to start a new one
@ -301,9 +306,10 @@ struct SignalsmithStretch {
stft.moveOutput(stft.defaultInterval()); // the actual input jumps forward in time by one interval, ready for the synthesis
blockProcess.newSpectrum = didSeek || (inputInterval > 0);
blockProcess.mappedFrequencies = customFreqMap || freqMultiplier != 1;
if (blockProcess.newSpectrum) {
// make sure the previous input is the correct distance in the past
blockProcess.reanalysePrev = didSeek || inputInterval != int(stft.defaultInterval());
// make sure the previous input is the correct distance in the past (give or take 1 sample)
blockProcess.reanalysePrev = didSeek || std::abs(inputInterval - int(stft.defaultInterval())) > 1;
if (blockProcess.reanalysePrev) blockProcess.steps += stft.analyseSteps() + 1;
// analyse a new input
@ -313,10 +319,14 @@ struct SignalsmithStretch {
blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
didSeek = false;
updateProcessSpectrumSteps();
blockProcess.steps += processSpectrumSteps;
blockProcess.steps += stft.synthesiseSteps() + 1;
}
#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP
SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP();
#endif
++blockProcess.samplesSinceLast;
stashedOutput.swap(stft.output);
@ -332,6 +342,9 @@ struct SignalsmithStretch {
copyInput(inputSamples);
prevInputOffset -= inputSamples;
#ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_END
SIGNALSMITH_STRETCH_PROFILE_PROCESS_END();
#endif
}
// Read the remaining output, providing no further input. `outputSamples` should ideally be at least `.outputLatency()`
@ -372,6 +385,7 @@ private:
bool newSpectrum = false;
bool reanalysePrev = false;
bool mappedFrequencies = false;
Sample timeFactor;
} blockProcess;
@ -477,19 +491,33 @@ private:
RandomEngine randomEngine;
static constexpr size_t processSpectrumSteps = 6;
void processSpectrum(size_t step, bool newSpectrum, Sample timeFactor) {
size_t processSpectrumSteps = 0;
static constexpr size_t splitMainPrediction = 8; // it's just heavy, since we're blending up to 4 different phase predictions
void updateProcessSpectrumSteps() {
processSpectrumSteps = 0;
if (blockProcess.newSpectrum) processSpectrumSteps += channels;
if (blockProcess.mappedFrequencies) {
processSpectrumSteps += smoothEnergySteps;
processSpectrumSteps += 1; // findPeaks
}
processSpectrumSteps += 1; // updating the output map
processSpectrumSteps += channels; // preliminary phase-vocoder prediction
processSpectrumSteps += splitMainPrediction;
if (blockProcess.newSpectrum) processSpectrumSteps += 1; // .input -> .prevInput
}
void processSpectrum(size_t step) {
Sample timeFactor = blockProcess.timeFactor;
Sample smoothingBins = Sample(stft.fftSamples())/stft.defaultInterval();
int longVerticalStep = std::round(smoothingBins);
timeFactor = std::max<Sample>(timeFactor, 1/maxCleanStretch);
bool randomTimeFactor = (timeFactor > maxCleanStretch);
std::uniform_real_distribution<Sample> timeFactorDist(maxCleanStretch*2*randomTimeFactor - timeFactor, timeFactor);
switch(step) {
case 1: {
if (newSpectrum) {
for (int c = 0; c < channels; ++c) {
auto bins = bandsForChannel(c);
if (blockProcess.newSpectrum) {
if (step < size_t(channels)) {
int channel = step;
auto bins = bandsForChannel(channel);
Complex rot = std::polar(Sample(1), bandToFreq(0)*stft.defaultInterval()*Sample(2*M_PI));
Sample freqStep = bandToFreq(1) - bandToFreq(0);
@ -501,18 +529,23 @@ private:
bin.prevInput = _impl::mul(bin.prevInput, rot);
rot = _impl::mul(rot, rotStep);
}
}
}
return;
}
case 2: {
if (customFreqMap || freqMultiplier != 1) {
findPeaks(smoothingBins);
step -= channels;
}
if (blockProcess.mappedFrequencies) {
if (step < smoothEnergySteps) {
smoothEnergy(step, smoothingBins);
return;
}
case 3: {
if (customFreqMap || freqMultiplier != 1) {
step -= smoothEnergySteps;
if (step-- == 0) {
findPeaks();
return;
}
}
if (step-- == 0) {
if (blockProcess.mappedFrequencies) {
updateOutputMap();
} else { // we're not pitch-shifting, so no need to find peaks etc.
for (int c = 0; c < channels; ++c) {
@ -527,9 +560,8 @@ private:
}
return;
}
case 4: {
// Preliminary output prediction from phase-vocoder
for (int c = 0; c < channels; ++c) {
if (step < size_t(channels)) {
size_t c = step;
Band *bins = bandsForChannel(c);
auto *predictions = predictionsForChannel(c);
for (int b = 0; b < bands; ++b) {
@ -549,12 +581,16 @@ private:
Complex phase = _impl::mul(outputBin.output, freqTwist);
outputBin.output = phase/(std::max(prevEnergy, prediction.energy) + noiseFloor);
}
}
return;
}
case 5: {
step -= channels;
if (step < splitMainPrediction) {
// Re-predict using phase differences between frequencies
for (int b = 0; b < bands; ++b) {
int chunk = step;
int startB = bands*chunk/splitMainPrediction;
int endB = bands*(chunk + 1)/splitMainPrediction;
for (int b = startB; b < endB; ++b) {
// Find maximum-energy channel and calculate that
int maxChannel = 0;
Sample maxEnergy = predictionsForChannel(0)[b].energy;
@ -629,20 +665,23 @@ private:
}
}
}
return;
}
step -= splitMainPrediction;
if (newSpectrum) {
if (blockProcess.newSpectrum) {
for (auto &bin : channelBands) {
bin.prevInput = bin.input;
}
}
return;
}
} // switch
}
// Produces smoothed energy across all channels
void smoothEnergy(Sample smoothingBins) {
static constexpr size_t smoothEnergySteps = 3;
Sample smoothEnergyState = 0;
void smoothEnergy(size_t step, Sample smoothingBins) {
Sample smoothingSlew = 1/(1 + smoothingBins*Sample(0.5));
if (step-- == 0) {
for (auto &e : energy) e = 0;
for (int c = 0; c < channels; ++c) {
Band *bins = bandsForChannel(c);
@ -655,8 +694,12 @@ private:
for (int b = 0; b < bands; ++b) {
smoothedEnergy[b] = energy[b];
}
Sample e = 0;
for (int repeat = 0; repeat < 2; ++repeat) {
smoothEnergyState = 0;
return;
}
// The two other steps are repeated smoothing passes, down and up
Sample e = smoothEnergyState;
for (int b = bands - 1; b >= 0; --b) {
e += (smoothedEnergy[b] - e)*smoothingSlew;
smoothedEnergy[b] = e;
@ -665,7 +708,7 @@ private:
e += (smoothedEnergy[b] - e)*smoothingSlew;
smoothedEnergy[b] = e;
}
}
smoothEnergyState = e;
}
Sample mapFreq(Sample freq) const {
@ -678,9 +721,7 @@ private:
}
// Identifies spectral peaks using energy across all channels
void findPeaks(Sample smoothingBins) {
smoothEnergy(smoothingBins);
void findPeaks() {
peaks.resize(0);
int start = 0;