Skip to content

Commit 1aa5a03

Browse files
authored
benchmark: report paired benchmark comparisons (#4689)
1 parent 4dddbc9 commit 1aa5a03

5 files changed

Lines changed: 227 additions & 28 deletions

File tree

resources/benchmark/config.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ export const targetPairwiseComparisonIntervalHalfWidth = 2;
1313

1414
export const memorySamplesPerBenchmark = 10;
1515

16+
// Thresholds on relative ops/sec (1 + speedupPercent / 100) that
// pairedSpeedupColorFn uses for slowdowns whose confidence interval excludes
// zero: above pairedGreenThreshold the change is printed grey, above
// pairedYellowThreshold it is printed yellow, otherwise red.
export const pairedGreenThreshold = 0.95;
17+
export const pairedYellowThreshold = 0.8;
18+
1619
export const timingBenchmarkNodeFlags: ReadonlyArray<string> = ['--expose-gc'];
1720

1821
export const memoryBenchmarkNodeFlags: ReadonlyArray<string> = [

resources/benchmark/output.ts

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
import type { BenchmarkResult } from './types.js';
1+
import { pairedGreenThreshold, pairedYellowThreshold } from './config.js';
2+
import type { BenchmarkResult, PairedComparison } from './types.js';
3+
4+
// Shape of the value returned by pairedSpeedupColorFn: a function that takes
// the text (or number) to print and returns a string.
type ColorFn = (value: number | string) => string;
25

36
export function printBenchmarkResults(
47
results: ReadonlyArray<BenchmarkResult>,
@@ -54,6 +57,54 @@ export function printBenchmarkResults(
5457
}
5558
}
5659

60+
// Prints a heading followed by one column-aligned line per paired comparison:
// "baseline -> revision", the signed ops/sec change (colored by
// pairedSpeedupColorFn), the confidence interval and the number of pairs.
// Prints nothing at all when the list is empty.
export function printPairedComparisons(
  pairedComparisons: ReadonlyArray<PairedComparison>,
): void {
  if (pairedComparisons.length === 0) {
    return;
  }

  console.log(' ' + grey('paired round-by-round ops/sec changes:'));

  // Format every cell once so the width pass and the print pass use the
  // exact same text.
  const rows = pairedComparisons.map((comparison) => ({
    comparison,
    speedupText: formatSignedPercent(comparison.speedupPercent),
    intervalText: formatConfidenceInterval(comparison),
  }));

  const baselineWidth = maxBy(
    rows,
    ({ comparison }) => comparison.baselineRevision.length,
  );
  const revisionWidth = maxBy(
    rows,
    ({ comparison }) => comparison.revision.length,
  );
  const speedupWidth = maxBy(rows, ({ speedupText }) => speedupText.length);
  const intervalWidth = maxBy(rows, ({ intervalText }) => intervalText.length);

  for (const { comparison, speedupText, intervalText } of rows) {
    const colorize = pairedSpeedupColorFn(comparison);
    const change = colorize(speedupText.padStart(speedupWidth));
    const details = grey(
      '(95% CI ' +
        intervalText.padStart(intervalWidth) +
        ', ' +
        comparison.numPairs +
        ' paired runs)',
    );

    console.log(
      ' ' +
        comparison.baselineRevision.padEnd(baselineWidth) +
        ' -> ' +
        comparison.revision.padEnd(revisionWidth) +
        grey(' x ') +
        change +
        ' ops/sec change ' +
        details,
    );
  }
}
57108
function beautifyBytes(bytes: number): string {
58109
const sizes = ['Bytes', 'KB', 'MB', 'GB'];
59110
const i = Math.floor(Math.log2(bytes) / 10);
@@ -64,10 +115,52 @@ function beautifyNumber(num: number): string {
64115
return Number(num.toFixed(num > 100 ? 0 : 2)).toLocaleString();
65116
}
66117

118+
// Formats a percentage with two decimals and a trailing "%", prefixing "+" to
// positive values. The value is rounded before the sign is chosen, so anything
// that rounds to zero (including small negatives) prints as "0.00%".
function formatSignedPercent(num: number): string {
  const rounded = Number(num.toFixed(2));
  const text = `${rounded.toFixed(2)}%`;
  return rounded > 0 ? `+${text}` : text;
}
123+
124+
// Renders the interval bounds of a paired comparison as "<low> to <high>",
// with each bound formatted by formatSignedPercent.
function formatConfidenceInterval(comparison: PairedComparison): string {
  const low = formatSignedPercent(comparison.ciLowPercent);
  const high = formatSignedPercent(comparison.ciHighPercent);
  return `${low} to ${high}`;
}
134+
67135
// Returns the largest value `fn` produces over `array`. An empty array yields
// -Infinity (the same result as calling Math.max with no arguments), and a NaN
// produced by `fn` makes the result NaN.
function maxBy<T>(array: ReadonlyArray<T>, fn: (obj: T) => number): number {
  // Fold instead of spreading into Math.max: a spread passes one argument per
  // element and can exceed the engine's argument limit on very large arrays.
  let max = -Infinity;
  for (const item of array) {
    max = Math.max(max, fn(item));
  }
  return max;
}
70138

139+
// Chooses the color function for a comparison's ops/sec change:
//  - grey when the confidence interval contains zero;
//  - for negative changes, grey / yellow / red depending on how relative
//    ops/sec (1 + speedupPercent / 100) compares with pairedGreenThreshold
//    and pairedYellowThreshold;
//  - green otherwise.
function pairedSpeedupColorFn(comparison: PairedComparison): ColorFn {
  const { speedupPercent, ciLowPercent, ciHighPercent } = comparison;

  const intervalContainsZero = ciLowPercent <= 0 && ciHighPercent >= 0;
  if (intervalContainsZero) {
    return grey;
  }

  if (speedupPercent < 0) {
    const relativeOps = 1 + speedupPercent / 100;
    return relativeOps > pairedGreenThreshold
      ? grey
      : relativeOps > pairedYellowThreshold
        ? yellow
        : red;
  }

  return green;
}
163+
71164
// Wraps the value between the escape sequences ESC[1m and ESC[0m and returns
// the resulting string.
export function bold(str: number | string): string {
  return `\u001b[1m${str}\u001b[0m`;
}

resources/benchmark/run.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,16 @@ import {
77
memorySamplesPerBenchmark,
88
minTimingSamplesPerBenchmark,
99
} from './config.js';
10-
import { cyan, printBenchmarkResults, red } from './output.js';
10+
import {
11+
cyan,
12+
printBenchmarkResults,
13+
printPairedComparisons,
14+
red,
15+
} from './output.js';
1116
import { prepareBenchmarkProjects } from './projects.js';
1217
import {
1318
computeStats,
19+
getPairedComparisons,
1420
havePairwiseComparisonsStabilized,
1521
} from './statistics.js';
1622
import type { BenchmarkProject, BenchmarkResult } from './types.js';
@@ -91,6 +97,12 @@ function runBenchmark(
9197
console.log('\n');
9298

9399
printBenchmarkResults(results);
100+
printPairedComparisons(
101+
getPairedComparisons(
102+
benchmarkProjects.map(({ revision }) => revision),
103+
timingSamples,
104+
),
105+
);
94106
console.log('');
95107
}
96108

resources/benchmark/statistics.ts

Lines changed: 107 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import {
44
NS_PER_SEC,
55
targetPairwiseComparisonIntervalHalfWidth,
66
} from './config.js';
7-
import type { BenchmarkResult } from './types.js';
7+
import type { BenchmarkResult, PairedComparison } from './types.js';
88

99
// T-Distribution two-tailed critical values for 95% confidence.
1010
// See http://www.itl.nist.gov/div898/handbook/eda/section3/eda3672.htm.
@@ -18,29 +18,67 @@ const tTable: { [v: number]: number } = {
1818
};
1919
const tTableInfinity = 1.96;
2020

21+
// Summary of a list of log ratios, as produced by computeLogRatioStats:
// meanRatio is exp(mean), lowRatio and highRatio are exp(mean - marginOfError)
// and exp(mean + marginOfError), and numSamples is the number of log ratios.
interface LogRatioStats {
22+
meanRatio: number;
23+
lowRatio: number;
24+
highRatio: number;
25+
numSamples: number;
26+
}
27+
2128
// Computes stats on benchmark results.
2229
export function computeStats(
2330
name: string,
2431
timingSamples: ReadonlyArray<number>,
2532
memorySamples: ReadonlyArray<number>,
2633
): BenchmarkResult {
27-
const { mean, marginOfError } = computeMeanStats(timingSamples);
28-
29-
let meanMemUsed = 0;
30-
for (const memUsed of memorySamples) {
31-
meanMemUsed += memUsed;
32-
}
33-
meanMemUsed /= memorySamples.length;
34+
const { mean } = computeMeanStats(timingSamples);
3435

3536
return {
3637
name,
37-
memPerOp: Math.floor(meanMemUsed),
38+
memPerOp: Math.floor(computeMean(memorySamples)),
3839
ops: NS_PER_SEC / mean,
39-
deviation: (marginOfError / mean) * 100 || 0,
40+
deviation: computeRelativeMarginOfError(timingSamples),
4041
numSamples: timingSamples.length,
4142
};
4243
}
4344

45+
// Builds a paired comparison for every pair of revisions, treating the
// revision with the higher index as the baseline. `revisions` supplies the
// names and is indexed in parallel with `timingSamplesByRevision`. Pairs for
// which computePairedComparison returns nothing are left out.
export function getPairedComparisons(
  revisions: ReadonlyArray<string>,
  timingSamplesByRevision: ReadonlyArray<ReadonlyArray<number>>,
): Array<PairedComparison> {
  const comparisons: Array<PairedComparison> = [];

  // Index 0 has no earlier revision to pair with, so its inner loop is empty.
  for (const [baselineIndex, baselineSamples] of timingSamplesByRevision.entries()) {
    for (let revisionIndex = 0; revisionIndex < baselineIndex; ++revisionIndex) {
      const stats = computePairedComparison(
        baselineSamples,
        timingSamplesByRevision[revisionIndex],
      );

      if (stats != null) {
        comparisons.push({
          baselineRevision: revisions[baselineIndex],
          revision: revisions[revisionIndex],
          ...stats,
        });
      }
    }
  }

  return comparisons;
}
81+
4482
export function havePairwiseComparisonsStabilized(
4583
timingSamplesByRevision: ReadonlyArray<ReadonlyArray<number>>,
4684
): boolean {
@@ -56,15 +94,13 @@ export function havePairwiseComparisonsStabilized(
5694
revisionIndex < baselineIndex;
5795
++revisionIndex
5896
) {
59-
const ciHalfWidthPercent = computeLogRatioRelativeMarginOfError(
60-
getRoundLogRatios(
61-
baselineSamples,
62-
timingSamplesByRevision[revisionIndex],
63-
),
97+
const paired = computePairedComparison(
98+
baselineSamples,
99+
timingSamplesByRevision[revisionIndex],
64100
);
65101
if (
66-
ciHalfWidthPercent == null ||
67-
ciHalfWidthPercent > targetPairwiseComparisonIntervalHalfWidth
102+
paired == null ||
103+
paired.ciHalfWidthPercent > targetPairwiseComparisonIntervalHalfWidth
68104
) {
69105
return false;
70106
}
@@ -74,11 +110,52 @@ export function havePairwiseComparisonsStabilized(
74110
return true;
75111
}
76112

77-
function computeLogRatioRelativeMarginOfError(
113+
// Margin of error of the samples expressed as a percentage of their mean,
// both taken from computeMeanStats. Falls back to 0 when the division does
// not yield a usable number (NaN) or yields zero.
function computeRelativeMarginOfError(samples: ReadonlyArray<number>): number {
  const stats = computeMeanStats(samples);
  const relativePercent = (stats.marginOfError / stats.mean) * 100;
  return relativePercent || 0;
}
117+
118+
function computeLogRatioStats(
78119
logRatios: ReadonlyArray<number>,
79-
): number | undefined {
80-
const { marginOfError } = computeMeanStats(logRatios);
81-
return Math.expm1(marginOfError) * 100;
120+
): LogRatioStats | undefined {
121+
if (logRatios.length < 2) {
122+
return;
123+
}
124+
125+
const { mean, marginOfError } = computeMeanStats(logRatios);
126+
return {
127+
meanRatio: Math.exp(mean),
128+
lowRatio: Math.exp(mean - marginOfError),
129+
highRatio: Math.exp(mean + marginOfError),
130+
numSamples: logRatios.length,
131+
};
132+
}
133+
134+
// Turns the log-ratio statistics of two sample series into percentage
// figures: the mean change, the low and high interval bounds, and the
// interval half width (the larger distance from the mean change to either
// bound). Returns undefined when computeLogRatioStats produces no statistics.
function computePairedComparison(
  baselineSamples: ReadonlyArray<number>,
  samples: ReadonlyArray<number>,
): Omit<PairedComparison, 'baselineRevision' | 'revision'> | undefined {
  const stats = computeLogRatioStats(
    getRoundLogRatios(baselineSamples, samples),
  );
  if (stats == null) {
    return undefined;
  }

  // A ratio of 1 means "no change"; express the offset from it in percent.
  const toPercentChange = (ratio: number): number => (ratio - 1) * 100;

  const speedupPercent = toPercentChange(stats.meanRatio);
  const ciLowPercent = toPercentChange(stats.lowRatio);
  const ciHighPercent = toPercentChange(stats.highRatio);

  const distanceToLow = Math.abs(speedupPercent - ciLowPercent);
  const distanceToHigh = Math.abs(ciHighPercent - speedupPercent);

  return {
    speedupPercent,
    ciLowPercent,
    ciHighPercent,
    ciHalfWidthPercent: Math.max(distanceToLow, distanceToHigh),
    numPairs: stats.numSamples,
  };
}
83160

84161
function getRoundLogRatios(
@@ -94,17 +171,21 @@ function getRoundLogRatios(
94171
return logRatios;
95172
}
96173

174+
// Arithmetic mean of the samples, summed left to right. An empty array
// yields NaN (0 / 0).
function computeMean(samples: ReadonlyArray<number>): number {
  const total = samples.reduce((sum, sample) => sum + sample, 0);
  return total / samples.length;
}
181+
97182
function computeMeanStats(samples: ReadonlyArray<number>): {
98183
mean: number;
99184
marginOfError: number;
100185
} {
101186
assert(samples.length > 1);
102187

103-
let mean = 0;
104-
for (const sample of samples) {
105-
mean += sample;
106-
}
107-
mean /= samples.length;
188+
const mean = computeMean(samples);
108189

109190
let variance = 0;
110191
for (const sample of samples) {

resources/benchmark/types.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,13 @@ export interface BenchmarkResult {
1010
deviation: number;
1111
numSamples: number;
1212
}
13+
14+
// Result of comparing one revision against a baseline revision, as assembled
// by getPairedComparisons and computePairedComparison:
//  - baselineRevision / revision: names taken from the revisions list;
//  - speedupPercent: (mean ratio - 1) * 100;
//  - ciLowPercent / ciHighPercent: the low and high ratio bounds converted
//    the same way;
//  - ciHalfWidthPercent: the larger distance from speedupPercent to either
//    bound;
//  - numPairs: number of log ratios the statistics were computed from.
export interface PairedComparison {
15+
baselineRevision: string;
16+
revision: string;
17+
speedupPercent: number;
18+
ciLowPercent: number;
19+
ciHighPercent: number;
20+
ciHalfWidthPercent: number;
21+
numPairs: number;
22+
}

0 commit comments

Comments
 (0)