Skip to content

Commit 88c4543

Browse files
authored
benchmark: stop timing after stable pairwise comparisons (#4685)
Collect timing rounds until every pairwise revision comparison has a narrow enough 95% confidence interval around mean speedup, or until the benchmark reaches the maxTime budget. Keep maxTime as a hard cap while using minTimingSamplesPerBenchmark as a best-effort minimum before dynamic stopping can take effect.
1 parent aed7c65 commit 88c4543

3 files changed

Lines changed: 82 additions & 8 deletions

File tree

resources/benchmark/config.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@ export const LOCAL = 'local';
44
// The maximum total time in seconds spent collecting timing samples
55
// across all revisions for one benchmark.
66
export const maxTime = 60;
7-
// The minimum sample size required to perform statistical analysis.
8-
export const minSamples = 5;
7+
// The minimum sample size to collect for each revision before allowing
8+
// dynamic stopping. maxTime remains a hard upper bound.
9+
export const minTimingSamplesPerBenchmark = 10;
10+
// Stop timing once every pairwise revision comparison has a 95% confidence
11+
// interval whose half-width is at most this many percent of the mean ratio.
12+
export const targetPairwiseComparisonIntervalHalfWidth = 2;
913

1014
export const memorySamplesPerBenchmark = 10;
1115

resources/benchmark/run.ts

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,17 @@ import assert from 'node:assert';
22
import path from 'node:path';
33

44
import { getArguments } from './args.js';
5-
import { maxTime, memorySamplesPerBenchmark, minSamples } from './config.js';
5+
import {
6+
maxTime,
7+
memorySamplesPerBenchmark,
8+
minTimingSamplesPerBenchmark,
9+
} from './config.js';
610
import { cyan, printBenchmarkResults, red } from './output.js';
711
import { prepareBenchmarkProjects } from './projects.js';
8-
import { computeStats } from './statistics.js';
12+
import {
13+
computeStats,
14+
havePairwiseComparisonsStabilized,
15+
} from './statistics.js';
916
import type { BenchmarkProject, BenchmarkResult } from './types.js';
1017
import {
1118
getBenchmarkName,
@@ -96,11 +103,18 @@ function collectTimingSamples(
96103
modulePath: path.join(project.projectPath, benchmark),
97104
samples: new Array<number>(),
98105
}));
106+
const timingSamples = sampleGroups.map(({ samples }) => samples);
99107

100-
// If time permits, increase sample size to reduce the margin of error.
108+
// Start new timing rounds only while the total budget remains. Within that
109+
// budget, collect the minimum sample size before checking whether every
110+
// pairwise revision comparison has stabilized.
101111
const start = Date.now();
102112
let round = 0;
103-
while (round < minSamples || (Date.now() - start) / 1e3 < maxTime) {
113+
while (
114+
(Date.now() - start) / 1e3 < maxTime &&
115+
(round < minTimingSamplesPerBenchmark ||
116+
!havePairwiseComparisonsStabilized(timingSamples))
117+
) {
104118
for (const sampleGroup of shuffled(sampleGroups)) {
105119
try {
106120
const sample = sampleTimingModule(sampleGroup.modulePath);
@@ -120,7 +134,7 @@ function collectTimingSamples(
120134
' completed ' + cyan(round) + ' timing rounds...\u000D',
121135
);
122136
}
123-
return sampleGroups.map(({ samples }) => samples);
137+
return timingSamples;
124138
}
125139

126140
function shuffled<T>(array: ReadonlyArray<T>): Array<T> {

resources/benchmark/statistics.ts

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import assert from 'node:assert';
22

3-
import { NS_PER_SEC } from './config.js';
3+
import {
4+
NS_PER_SEC,
5+
targetPairwiseComparisonIntervalHalfWidth,
6+
} from './config.js';
47
import type { BenchmarkResult } from './types.js';
58

69
// T-Distribution two-tailed critical values for 95% confidence.
@@ -38,6 +41,59 @@ export function computeStats(
3841
};
3942
}
4043

44+
/**
 * Reports whether every pair of revisions has been compared precisely enough
 * for timing collection to stop.
 *
 * Each revision is paired with every revision that precedes it in
 * `timingSamplesByRevision`; the later revision of a pair is passed to
 * `getRoundLogRatios` as the baseline. A pair is rejected when its margin of
 * error is missing or exceeds `targetPairwiseComparisonIntervalHalfWidth`.
 *
 * @param timingSamplesByRevision - Timing samples, one array per revision.
 * @returns `false` as soon as one pair is rejected, otherwise `true`. With
 * fewer than two revisions there are no pairs, so the result is `true`.
 */
export function havePairwiseComparisonsStabilized(
  timingSamplesByRevision: ReadonlyArray<ReadonlyArray<number>>,
): boolean {
  return timingSamplesByRevision.every((referenceSamples, referenceIndex) =>
    timingSamplesByRevision
      // Only earlier revisions, so each unordered pair is visited once.
      .slice(0, referenceIndex)
      .every((comparedSamples) => {
        const halfWidthPercent = computeLogRatioRelativeMarginOfError(
          getRoundLogRatios(referenceSamples, comparedSamples),
        );
        const isTooWide =
          halfWidthPercent == null ||
          halfWidthPercent > targetPairwiseComparisonIntervalHalfWidth;
        return !isTooWide;
      }),
  );
}
76+
77+
/**
 * Converts the margin of error of a series of log ratios into a relative
 * percentage.
 *
 * The margin of error comes from `computeMeanStats` applied to the log ratios
 * (presumably in the same log units as its input; its body is not shown
 * here). `Math.expm1` maps that value back to a relative deviation of the
 * ratio itself, which is then scaled to percent.
 *
 * @param logRatios - Natural logs of per-round timing ratios.
 * @returns The half-width in percent, or `undefined` when it cannot be
 * estimated: fewer than two ratios were supplied, or the result is not a
 * finite number.
 */
function computeLogRatioRelativeMarginOfError(
  logRatios: ReadonlyArray<number>,
): number | undefined {
  // A spread cannot be estimated from fewer than two observations.
  if (logRatios.length < 2) {
    return undefined;
  }

  const { marginOfError } = computeMeanStats(logRatios);
  const halfWidthPercent = Math.expm1(marginOfError) * 100;

  // NaN compares false against any threshold, so a caller testing
  // `value > limit` would accept it; report "unknown" instead.
  return Number.isFinite(halfWidthPercent) ? halfWidthPercent : undefined;
}
83+
84+
/**
 * Pairs two sample series by position and returns the natural log of each
 * baseline-to-candidate ratio.
 *
 * Only the first `min(baselineSamples.length, samples.length)` positions are
 * used; any surplus entries in the longer series are ignored.
 *
 * @param baselineSamples - Samples for the baseline revision.
 * @param samples - Samples for the revision compared against the baseline.
 * @returns One log ratio per paired position, in order.
 */
function getRoundLogRatios(
  baselineSamples: ReadonlyArray<number>,
  samples: ReadonlyArray<number>,
): Array<number> {
  const pairCount = Math.min(baselineSamples.length, samples.length);
  // An entry is positive when the baseline sample is the larger of the two,
  // i.e. the candidate revision is faster than the baseline.
  return Array.from({ length: pairCount }, (_unused, round) =>
    Math.log(baselineSamples[round] / samples[round]),
  );
}
96+
4197
function computeMeanStats(samples: ReadonlyArray<number>): {
4298
mean: number;
4399
marginOfError: number;

0 commit comments

Comments
 (0)