benchmark: stop timing after stable pairwise comparisons (#4685)

yaacovCR · web-flow · commit 88c4543d460c · 2026-04-23T22:07:54.000+03:00
Collect timing rounds until every pairwise revision comparison has a narrow enough 95% confidence interval around the mean speedup, or until the benchmark reaches the maxTime budget.

Keep maxTime as a hard cap while using minTimingSamplesPerBenchmark as a best-effort minimum before dynamic stopping can take effect.
diff --git a/resources/benchmark/config.ts b/resources/benchmark/config.ts
@@ -4,8 +4,12 @@ export const LOCAL = 'local';
 // The maximum total time in seconds spent collecting timing samples
 // across all revisions for one benchmark.
 export const maxTime = 60;
-// The minimum sample size required to perform statistical analysis.
-export const minSamples = 5;
+// The minimum sample size to collect for each revision before allowing
+// dynamic stopping. maxTime remains a hard upper bound.
+export const minTimingSamplesPerBenchmark = 10;
+// Stop timing once every pairwise revision comparison has a 95% confidence
+// interval half-width of at most this many percent of the mean ratio.
+export const targetPairwiseComparisonIntervalHalfWidth = 2;
 
 export const memorySamplesPerBenchmark = 10;
 
diff --git a/resources/benchmark/run.ts b/resources/benchmark/run.ts
@@ -2,10 +2,17 @@ import assert from 'node:assert';
 import path from 'node:path';
 
 import { getArguments } from './args.js';
-import { maxTime, memorySamplesPerBenchmark, minSamples } from './config.js';
+import {
+  maxTime,
+  memorySamplesPerBenchmark,
+  minTimingSamplesPerBenchmark,
+} from './config.js';
 import { cyan, printBenchmarkResults, red } from './output.js';
 import { prepareBenchmarkProjects } from './projects.js';
-import { computeStats } from './statistics.js';
+import {
+  computeStats,
+  havePairwiseComparisonsStabilized,
+} from './statistics.js';
 import type { BenchmarkProject, BenchmarkResult } from './types.js';
 import {
   getBenchmarkName,
@@ -96,11 +103,18 @@ function collectTimingSamples(
     modulePath: path.join(project.projectPath, benchmark),
     samples: new Array<number>(),
   }));
+  const timingSamples = sampleGroups.map(({ samples }) => samples);
 
-  // If time permits, increase sample size to reduce the margin of error.
+  // Start new timing rounds only while the total budget remains. Within that
+  // budget, collect the minimum sample size before checking whether every
+  // pairwise revision comparison has stabilized.
   const start = Date.now();
   let round = 0;
-  while (round < minSamples || (Date.now() - start) / 1e3 < maxTime) {
+  while (
+    (Date.now() - start) / 1e3 < maxTime &&
+    (round < minTimingSamplesPerBenchmark ||
+      !havePairwiseComparisonsStabilized(timingSamples))
+  ) {
     for (const sampleGroup of shuffled(sampleGroups)) {
       try {
         const sample = sampleTimingModule(sampleGroup.modulePath);
@@ -120,7 +134,7 @@ function collectTimingSamples(
       '  completed ' + cyan(round) + ' timing rounds...\u000D',
     );
   }
-  return sampleGroups.map(({ samples }) => samples);
+  return timingSamples;
 }
 
 function shuffled<T>(array: ReadonlyArray<T>): Array<T> {
diff --git a/resources/benchmark/statistics.ts b/resources/benchmark/statistics.ts
@@ -1,6 +1,9 @@
 import assert from 'node:assert';
 
-import { NS_PER_SEC } from './config.js';
+import {
+  NS_PER_SEC,
+  targetPairwiseComparisonIntervalHalfWidth,
+} from './config.js';
 import type { BenchmarkResult } from './types.js';
 
 // T-Distribution two-tailed critical values for 95% confidence.
@@ -38,6 +41,59 @@ export function computeStats(
   };
 }
 
+/**
+ * Reports whether timing can stop: true when every pair of revisions has a
+ * confidence-interval half-width, as a percentage around the mean ratio, of
+ * at most targetPairwiseComparisonIntervalHalfWidth.
+ *
+ * A half-width that is undefined or NaN counts as not stabilized, so an
+ * uncomputable comparison can never end sampling early.
+ *
+ * @param timingSamplesByRevision - one array of timing samples per revision;
+ *   samples at the same index are compared with each other.
+ * @returns true when all pairs are within the target; vacuously true for
+ *   fewer than two revisions.
+ */
+export function havePairwiseComparisonsStabilized(
+  timingSamplesByRevision: ReadonlyArray<ReadonlyArray<number>>,
+): boolean {
+  // Visit each unordered pair once; the later revision acts as the baseline.
+  return timingSamplesByRevision.every((baselineSamples, baselineIndex) =>
+    timingSamplesByRevision
+      .slice(0, baselineIndex)
+      .every((revisionSamples) => {
+        const ciHalfWidthPercent = computeLogRatioRelativeMarginOfError(
+          getRoundLogRatios(baselineSamples, revisionSamples),
+        );
+        // Phrased as a positive bound check so that NaN fails it.
+        return (
+          ciHalfWidthPercent != null &&
+          ciHalfWidthPercent <= targetPairwiseComparisonIntervalHalfWidth
+        );
+      }),
+  );
+}
+
+function computeLogRatioRelativeMarginOfError(
+  logRatios: ReadonlyArray<number>,
+): number | undefined {
+  // expm1 maps the log-space margin to a relative one; * 100 gives percent.
+  return Math.expm1(computeMeanStats(logRatios).marginOfError) * 100;
+}
+
+// Pairs samples by round index and returns log(baseline / candidate) for each
+// round in which both revisions have a sample.
+function getRoundLogRatios(
+  baselineSamples: ReadonlyArray<number>,
+  samples: ReadonlyArray<number>,
+): Array<number> {
+  const pairCount = Math.min(baselineSamples.length, samples.length);
+  // Positive values mean the candidate revision is faster than the baseline.
+  return Array.from({ length: pairCount }, (_, round) =>
+    Math.log(baselineSamples[round] / samples[round]),
+  );
+}
+
 function computeMeanStats(samples: ReadonlyArray<number>): {
   mean: number;
   marginOfError: number;