bench_runner.py 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116
  1. #!/usr/bin/env -S uv run --script
  2. # /// script
  3. # requires-python = ">=3.10"
  4. # dependencies = [
  5. # "numpy",
  6. # "rich",
  7. # "scipy",
  8. # "quantiphy",
  9. # ]
  10. # ///
  11. """Script to run GoogleBenchmark binaries repeatedly and render results.
  12. This script helps run benchmarks repeatedly and render the resulting
  13. measurements in a way that effectively surfaces noisy benchmarks and provides
  14. statistically significant information about the measurements.
  15. There are two primary modes:
  16. 1) Running a single experiment benchmark binary repeatedly to understand that
  17. benchmark's performance.
  18. 2) Running both an experiment and a baseline benchmark binary that include the
  19. same benchmark names to understand the change in performance for each named
  20. benchmark.
  21. Across all of these modes, when rendering a specific metric for a benchmark, we
  22. also render the confidence intervals based on the specified `--alpha` parameter.
  23. For mode (1) when running a single benchmark binary, there is additional support
  24. for passing regular expressions that describe a set of comparable benchmarks for
  25. some main benchmark. When used, the comparable benchmarks for each main one are
  26. rendered as a delta of the main rather than as completely independent metrics.
  27. For mode (2) when running an experiment and baseline binary, every benchmark is
  28. rendered as a delta of the experiment vs. the baseline.
  29. Whenever rendering a delta, this script will flag statistically significant
  30. (according to the provided `--alpha`) improvements or regressions, compute the
  31. improvement or regression, and display the resulting p-value. This script uses
  32. non-parametric U-test for statistical significance, the same as Go's benchmark
  33. comparison tools, based on the large body of evidence that benchmarks rarely if
  34. ever tend to adhere to a normal or other known distribution. A non-parametric
  35. statistical model instead provides a much more realistic basis for comparing two
  36. measurements.
  37. The reported metrics themselves are also classified into "speed" vs. "cost"
  38. metrics in order to model whether larger is an improvement or a regression.
  39. The script uses `uv` to run it rather than Python directly, which manages and
  40. caches its dependencies. For installation instructions for `uv` see:
  41. - Carbon's documentation:
  42. https://docs.carbon-lang.dev/docs/project/contribution_tools.html#optional-tools
  43. - UV's documentation: https://docs.astral.sh/uv/getting-started/installation/
  44. """
  45. from __future__ import annotations
  46. __copyright__ = """
  47. Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  48. Exceptions. See /LICENSE for license information.
  49. SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  50. """
  51. import argparse
  52. import json
  53. import math
  54. import numpy as np # type: ignore
  55. import re
  56. import scipy as sp # type: ignore
  57. import subprocess
  58. import sys
  59. from collections import defaultdict
  60. from dataclasses import dataclass, field
  61. from enum import Enum
  62. from pathlib import Path
  63. from quantiphy import Quantity # type: ignore
  64. from rich.console import Console
  65. from rich.padding import Padding
  66. from rich.progress import track
  67. from rich.table import Column, Table
  68. from rich.text import Text
  69. from rich.theme import Theme
  70. from typing import Optional
  71. def parse_args(args: Optional[list[str]] = None) -> argparse.Namespace:
  72. """Parsers command-line arguments and flags."""
  73. parser = argparse.ArgumentParser(description=__doc__)
  74. parser.add_argument(
  75. "--exp_benchmark",
  76. metavar="BINARY",
  77. required=True,
  78. type=Path,
  79. help="The experiment benchmark binary to run",
  80. )
  81. parser.add_argument(
  82. "--base_benchmark",
  83. metavar="BINARY",
  84. type=Path,
  85. help="""
  86. The baseline benchmark binary to run.
  87. Passing this flag will enable both a baseline and experiment, and change the
  88. analysis to compute and display any statistically significant delta as well
  89. as the before and after values of the each benchmark run.
  90. """.strip(),
  91. )
  92. parser.add_argument(
  93. "--benchmark_args",
  94. action="append",
  95. default=[],
  96. metavar="ARG",
  97. help="Extra arguments to both the experiment and baseline benchmark",
  98. )
  99. parser.add_argument(
  100. "--exp_benchmark_args",
  101. action="append",
  102. default=[],
  103. metavar="ARG",
  104. help="Extra arguments to the experiment benchmark",
  105. )
  106. parser.add_argument(
  107. "--base_benchmark_args",
  108. action="append",
  109. default=[],
  110. metavar="ARG",
  111. help="Extra arguments to the baseline benchmark",
  112. )
  113. parser.add_argument(
  114. "--benchmark_comparable_re",
  115. metavar="PATTERN",
  116. action="append",
  117. default=[],
  118. help="""
  119. A regular expression that is used to match sets of benchmarks that should be
  120. compared with each other. This flag may be specified multiple times with
  121. different regular expressions to handle multiple different grouping schemes or
  122. structures. May not be combined with `base_benchmark`.
  123. Each regular expression is used to group together benchmark names distinguished
  124. by a "tag" substring in the name. Either the regex as a whole or a `tag`
  125. symbolic capture group within the regex designates this substring. Further, a
  126. `main` symbolic capture group _must_ be included and only match when the
  127. specific substring is the main benchmark name and other matching ones should be
  128. viewed as comparisons against it. When rendering, only the name matching the
  129. main capture group will be rendered, with others rendered as comparisons against
  130. it based on the tag, and with statistical significance to evaluate the
  131. comparison.
  132. Example regex: `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench`
  133. This produces three tags, `Carbon`, `Abseil`, and `LLVM`. The main tag is
  134. `Carbon`.
  135. TODO: This is only currently supported without a base benchmark to provide
  136. relative comparisons within a single benchmark binary. There are good models for
  137. handling this and surfacing delta-of-delta information with a base benchmark
  138. binary.
  139. """.strip(),
  140. )
  141. parser.add_argument(
  142. "--runs",
  143. default=5,
  144. metavar="N",
  145. type=int,
  146. help="Number of runs of the benchmark",
  147. )
  148. parser.add_argument(
  149. "--wall_time",
  150. action="store_true",
  151. help="Use wall-clock time instead of CPU time",
  152. )
  153. parser.add_argument(
  154. "--show_iterations",
  155. action="store_true",
  156. help="Show the iteration counts",
  157. )
  158. parser.add_argument(
  159. "--extra_metrics_filter",
  160. metavar="PATTERN",
  161. type=str,
  162. help="A regex filter on the names of extra metrics to display.",
  163. )
  164. parser.add_argument(
  165. "--alpha",
  166. default=0.05,
  167. metavar="𝛂",
  168. type=float,
  169. help="""
  170. Threshold for P-values to be considered statistically significant. Also used to
  171. compute the confidence intervals for individual metrics.
  172. """.strip(),
  173. )
  174. parser.add_argument(
  175. "--output",
  176. choices=["console", "json"],
  177. default="console",
  178. help="""
  179. Output format to use, note that `json` output doesn't do any analysis of the
  180. results, and just dumps the aggregate JSON data from the repeated runs.
  181. """.strip(),
  182. )
  183. return parser.parse_args(args=args)
  184. # Pre-compiled regexes to match metrics that measure _speed_: larger is better.
  185. SPEED_METRIC_PATTERNS = [
  186. re.compile(p)
  187. for p in [
  188. r"(?i)rate",
  189. r"(?i).*per[\s_](second|ms|ns)",
  190. ]
  191. ]
  192. # Pre-compiled regexes to match metrics that measure _cost_: smaller is better.
  193. COST_METRIC_PATTERNS = [
  194. re.compile(p)
  195. for p in [
  196. r"(?i)cycles",
  197. r"(?i)instructions",
  198. r"(?i)time",
  199. ]
  200. ]
# Theme for use with the Rich `Console` printing.
THEME = Theme(
    {
        # Baseline values are rendered in cyan and experiment values in
        # magenta, for both the median metric and its confidence interval.
        "base_median": "cyan",
        "exp_median": "magenta",
        "base_conf": "cyan",
        "exp_conf": "magenta",
        # Regressions render red and improvements green.
        "slower": "bright_red",
        "faster": "bright_green",
    }
)
  212. # The set of benchmark keys we ignore in the JSON data structure. Most of these
  213. # are things are incidental, but a few are more surprising. See comments on
  214. # specific entries for details.
  215. IGNORED_BENCHMARK_KEYS = set(
  216. [
  217. "name",
  218. "family_index",
  219. "per_family_instance_index",
  220. "run_name",
  221. "run_type",
  222. "repetitions",
  223. "repetition_index",
  224. "threads",
  225. # We don't render `iterations` because we instead directly compute
  226. # statistical error bars using the multiple iterations. This removes the
  227. # need for manually considering the iteration count.
  228. "iterations",
  229. # We ignore the time and time unit metrics here because we directly
  230. # access and special case these metrics in order to apply the unit to
  231. # the times.
  232. "real_time",
  233. "cpu_time",
  234. "time_unit",
  235. ]
  236. )
  237. class DeltaKind(Enum):
  238. """Models the relevant kinds of deltas that we end up wanting to render."""
  239. IMPROVEMENT = "[faster]👍[/faster]"
  240. NEUTRAL = "~"
  241. REGRESSION = "[slower]👎[/slower]"
  242. NOISE = ""
  243. def __str__(self) -> str:
  244. return self.value
@dataclass
class RenderedDelta:
    """Rendered delta and pvalue for some metric."""

    # The classification of this delta (improvement, regression, etc.).
    kind: DeltaKind
    # The rendered delta string, including any style markup.
    delta: str
    # The rendered p-value string for the delta.
    pvalue: str
@dataclass
class RenderedMetric:
    """Rendered non-delta metric and its confidence interval."""

    # The rendered median value for the metric.
    median: str
    # The rendered confidence interval string; empty when every measurement
    # of the metric was identical.
    conf: str
  256. @dataclass
  257. class BenchmarkRunMetrics:
  258. """The main data class used to collect metrics for benchmark runs.
  259. The data is read in using a JSON format that isn't organized in a convenient
  260. way to analyze and render, so we re-organize it into this data class and use
  261. that for analysis.
  262. Each object of this class corresponds to a specific named benchmark.
  263. """
  264. # The main metrics for this named benchmark, or the "experiment". This field
  265. # is always populated.
  266. exp: list[Quantity] = field(default_factory=lambda: [])
  267. # The metrics for this named benchmark in the base execution. May be empty
  268. # if no base execution was provided to compute a delta against.
  269. base: list[Quantity] = field(default_factory=lambda: [])
  270. # Any comparable benchmark metrics, indexed by the tag name to use when
  271. # rendering the comparison. May be empty if there are no comparable
  272. # benchmarks for the main one this represents.
  273. comps: defaultdict[str, list[Quantity]] = field(
  274. default_factory=lambda: defaultdict(list)
  275. )
  276. @dataclass
  277. class ComparableBenchmarkMapping:
  278. """Organizes any comparable benchmarks.
  279. Constructed with the list of benchmark names and regexes that describe
  280. comparable name structures.
  281. Names that match one of these regexes are organized into the main name in
  282. `main_benchmark_names`, and the comparable names in various mappings to
  283. allow computing comparisons metrics between the main and comparable names.
  284. Names that don't match any of the regexes are just directly included in
  285. `main_benchmark_names`.
  286. """
  287. # Names that are considered "main" benchmarks after filtering.
  288. main_benchmark_names: list[str]
  289. # Maps a comparison benchmark name to its base name (tag removed).
  290. name_to_base: dict[str, str]
  291. # Maps a base name to its main benchmark name.
  292. base_to_main_name: dict[str, str]
  293. # Maps a comparison benchmark name to its tag.
  294. name_to_comp_tag: dict[str, str]
  295. # Maps a main benchmark name to a list of its comparison tags.
  296. main_name_to_comp_tags: dict[str, list[str]]
  297. def __init__(
  298. self,
  299. original_benchmark_names: list[str],
  300. comparable_re_strs: list[str],
  301. console: Console,
  302. ):
  303. """Identify main and comparable benchmarks."""
  304. self.main_benchmark_names = []
  305. self.name_to_base = {}
  306. self.base_to_main_name = {}
  307. self.name_to_comp_tag = {}
  308. self.main_name_to_comp_tags = {}
  309. comp_res = [
  310. re.compile(comparable_re_str)
  311. for comparable_re_str in comparable_re_strs
  312. ]
  313. for comp_re in comp_res:
  314. if "main" not in comp_re.groupindex:
  315. console.log(
  316. "ERROR: No main capture group in the "
  317. "`--benchmark_comparable_re` flag!"
  318. )
  319. sys.exit(1)
  320. for name in original_benchmark_names:
  321. comp_match = next(
  322. (m for comp_re in comp_res if (m := comp_re.search(name))), None
  323. )
  324. if not comp_match:
  325. # Non-comparable benchmark
  326. self.main_benchmark_names.append(name)
  327. continue
  328. tag_group = 0
  329. if "tag" in comp_match.re.groupindex:
  330. tag_group = comp_match.re.groupindex["tag"]
  331. tag = comp_match.group(tag_group)
  332. tag_begin, tag_end = comp_match.span(tag_group)
  333. base_name = name[:tag_begin] + name[tag_end:]
  334. self.name_to_base[name] = base_name
  335. if comp_match.group("main"):
  336. self.base_to_main_name[base_name] = name
  337. self.main_benchmark_names.append(name)
  338. else:
  339. self.name_to_comp_tag[name] = tag
  340. # Verify that for all the comparable benchmarks we actually found a main
  341. # benchmark name. We can't do this while processing initially as we
  342. # don't know the relative order of main and comparable benchmark names.
  343. #
  344. # Also collect a list of all the comparison tags for a given main name.
  345. # self.main_name_to_comp_tags: dict[str, list[str]] = {}
  346. for comp, comp_tag in self.name_to_comp_tag.items():
  347. base_name = self.name_to_base[comp]
  348. main_name = self.base_to_main_name[base_name]
  349. if not main_name:
  350. console.log(
  351. f"ERROR: Comparable benchmark `{comp}` has no corresponding"
  352. " main benchmark name!"
  353. )
  354. sys.exit(1)
  355. if comp_tag in self.main_name_to_comp_tags.get(main_name, []):
  356. console.log(
  357. f"ERROR: Duplicate comparison tag `{comp_tag}` for main "
  358. f"benchmark `{main_name}`!"
  359. )
  360. sys.exit(1)
  361. self.main_name_to_comp_tags.setdefault(main_name, []).append(
  362. comp_tag
  363. )
  364. def float_ratio(nom: float, denom: float) -> float:
  365. """Translate a ratio of floats into a float, handling divide by zero."""
  366. if denom != 0.0:
  367. return nom / denom
  368. elif nom > 0.0:
  369. return math.inf
  370. elif nom < 0.0:
  371. return -math.inf
  372. else:
  373. return 0.0
  374. def render_fixed_width_float(x: float) -> str:
  375. """Renders a floating point value into a fixed width string."""
  376. if math.isinf(x):
  377. return f"{x:>4f}{'':<3}"
  378. (frac, whole) = math.modf(x)
  379. frac_str = f"{math.fabs(frac):<4.3f}"[1:]
  380. return f"{int(whole):> 3}{frac_str}"
  381. def render_ratio(ratio: float) -> str:
  382. """Renders a ratio into a human-friendly string form.
  383. This uses a % for ratios with a magnitude less than 1.0. For ratios with a
  384. larger magnitude, they are rendered as a fixed width floating point number
  385. with an `x` suffix.
  386. """
  387. if ratio > 1.0 or ratio < -1.0:
  388. return f"{render_fixed_width_float(ratio)}x"
  389. else:
  390. return f"{render_fixed_width_float(ratio * 100.0)}%"
  391. def render_metric(
  392. alpha: float, times: list[Quantity], is_base: bool
  393. ) -> RenderedMetric:
  394. """Render a non-delta metric.
  395. Computes the string to use for both the metric itself and the string to show
  396. the confidence interval for that metric.
  397. Args:
  398. alpha: The alpha value to use for the confidence interval.
  399. times: The list of measurements.
  400. is_base:
  401. Whether to use the "baseline" or "experiment" theme in the rendered
  402. strings.
  403. """
  404. if is_base:
  405. style_prefix = "base_"
  406. else:
  407. style_prefix = "exp_"
  408. units = times[0].units
  409. if all(x == times[0] for x in times):
  410. with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
  411. return RenderedMetric(
  412. f"[{style_prefix}median]{times[0]:.3}[/{style_prefix}median]",
  413. "",
  414. )
  415. median = Quantity(np.median(times), units=units)
  416. median_test = sp.stats.quantile_test(times, q=median)
  417. median_ci = median_test.confidence_interval(confidence_level=(1.0 - alpha))
  418. ci_str = "?"
  419. if not math.isnan(median_ci.low) and not math.isnan(median_ci.high):
  420. low_delta = median - median_ci.low
  421. high_delta = median_ci.high - median
  422. assert low_delta >= 0.0, high_delta >= 0.0
  423. delta = max(low_delta, high_delta)
  424. ci_str = render_ratio(float_ratio(delta, median))
  425. with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
  426. return RenderedMetric(
  427. f"[{style_prefix}median]{median:.3}[/{style_prefix}median]",
  428. f"[{style_prefix}conf]{ci_str:9}[/{style_prefix}conf]",
  429. )
  430. def render_delta(
  431. metric: str, alpha: float, base: list[Quantity], exp: list[Quantity]
  432. ) -> RenderedDelta:
  433. """Render a delta metric.
  434. This handles computing the delta, its statistical significance, and
  435. whether that delta is an improvement or a regression based on the specific
  436. metric name.
  437. Args:
  438. metric:
  439. The name of the metric to guide whether bigger or smaller is an
  440. improvement.
  441. alpha: The alpha value to use for the confidence interval.
  442. base: The baseline measurements.
  443. exp: The experiment measurements.
  444. """
  445. # Skip any delta when all the data is zero. This typically occurs for
  446. # uninteresting metrics or metrics that weren't collected for a given run.
  447. if all(b == 0 for b in base) and all(e == 0 for e in exp):
  448. return RenderedDelta(DeltaKind.NEUTRAL, "", "")
  449. if any(speed_pat.search(metric) for speed_pat in SPEED_METRIC_PATTERNS):
  450. bigger_style = "faster"
  451. smaller_style = "slower"
  452. bigger_kind = DeltaKind.IMPROVEMENT
  453. smaller_kind = DeltaKind.REGRESSION
  454. elif any(cost_pat.search(metric) for cost_pat in COST_METRIC_PATTERNS):
  455. bigger_style = "slower"
  456. smaller_style = "faster"
  457. bigger_kind = DeltaKind.REGRESSION
  458. smaller_kind = DeltaKind.IMPROVEMENT
  459. else:
  460. return RenderedDelta(DeltaKind.NEUTRAL, "", "")
  461. u_test = sp.stats.mannwhitneyu(base, exp)
  462. if u_test.pvalue >= alpha:
  463. return RenderedDelta(
  464. DeltaKind.NOISE, " ?? ", f"p={u_test.pvalue:.3}"
  465. )
  466. kind = DeltaKind.NEUTRAL
  467. base_median = np.median(base)
  468. exp_median = np.median(exp)
  469. exp_ratio = float_ratio(exp_median, base_median)
  470. # TODO: Maybe the threshold of "interesting" should be configurable instead
  471. # of being fixed at 0.1%.
  472. if exp_ratio >= 1.001:
  473. style = bigger_style
  474. kind = bigger_kind
  475. elif exp_ratio <= 0.999:
  476. style = smaller_style
  477. kind = smaller_kind
  478. else:
  479. style = "default"
  480. if exp_ratio >= 2.0 or exp_ratio <= 0.5:
  481. return RenderedDelta(
  482. kind,
  483. f"[{style}]{render_fixed_width_float(exp_ratio)}x[/{style}]",
  484. f"p={u_test.pvalue:.3}",
  485. )
  486. # Use a percent-delta for smaller ratios to make the delta more easily
  487. # understood by readers.
  488. exp_delta_percent = (
  489. float_ratio(exp_median - base_median, base_median) * 100.0
  490. )
  491. return RenderedDelta(
  492. kind,
  493. f"[{style}]{render_fixed_width_float(exp_delta_percent)}%[/{style}]",
  494. f"p={u_test.pvalue:.3}",
  495. )
  496. def render_metric_column(
  497. metric: str,
  498. alpha: float,
  499. runs: list[BenchmarkRunMetrics],
  500. ) -> Table:
  501. """Render the column of the benchmark results table for a given metric.
  502. We render a single column for each metric, and use a careful line-oriented
  503. layout within the column to ensure "rows" line up for each individual
  504. benchmark. Within the column, we use a nested table to layout the different
  505. rendered strings.
  506. A key goal of the rendering throughout is to arrange for rendered numbers to
  507. have the decimal point in a consistent column so that it isn't confusing for
  508. readers to identify the position of the decimal point and magnitude of the
  509. number rendered.
  510. Args:
  511. metric: The name of the metric to render.
  512. alpha: The alpha value to use for the confidence interval.
  513. runs: The list of benchmark runs.
  514. """
  515. t = Table.grid(
  516. Column(),
  517. # It might seem like we want the left column here to be right-aligned,
  518. # but we're going to carefully align the digits in the format string,
  519. # and we can't easily control the length of units. So we left-align to
  520. # simplify the digit layout.
  521. Column(justify="left"),
  522. Column(justify="center"),
  523. Column(justify="left"),
  524. padding=(0, 1),
  525. )
  526. for run in runs:
  527. if len(run.base) != 0:
  528. # We have a baseline run to compare against, so compute the delta
  529. # between it and the experiment as well as the specific baseline run
  530. # metric.
  531. rendered_delta = render_delta(metric, alpha, run.base, run.exp)
  532. rendered_base = render_metric(alpha, run.base, is_base=True)
  533. # Add the delta as the first row, then the baseline metric.
  534. t.add_row(
  535. str(rendered_delta.kind),
  536. rendered_delta.delta,
  537. "",
  538. rendered_delta.pvalue,
  539. )
  540. t.add_row("", rendered_base.median, "±", rendered_base.conf)
  541. # Now render the experiment metric and add its row.
  542. rendered_exp = render_metric(alpha, run.exp, is_base=False)
  543. t.add_row("", rendered_exp.median, "±", rendered_exp.conf)
  544. # If we have any comparable benchmarks, render each of them as first a
  545. # delta and then the specific comparable metric as its own kind of
  546. # baseline.
  547. #
  548. # TODO: At some point when we support combining baseline _runs_ with
  549. # comparable metrics, we'll need to change this to render both baseline
  550. # and experiment comparables and a delta-of-delta. But currently we
  551. # don't support combining these which simplifies the rendering here.
  552. for name, comp in sorted(run.comps.items()):
  553. rendered_delta = render_delta(metric, alpha, comp, run.exp)
  554. t.add_row(
  555. str(rendered_delta.kind),
  556. rendered_delta.delta,
  557. "",
  558. rendered_delta.pvalue,
  559. )
  560. rendered_comp = render_metric(alpha, comp, is_base=True)
  561. t.add_row("", rendered_comp.median, "±", rendered_comp.conf)
  562. # Lastly, if we had a baseline run or any comparable metrics we will
  563. # have rendered multiple lines of data. Add a blank line so that these
  564. # form a visual group.
  565. if len(run.base) != 0 or len(run.comps) != 0:
  566. t.add_row()
  567. return t
  568. def run_benchmark_binary(
  569. binary_path: Path,
  570. common_args: list[str],
  571. specific_args: list[str],
  572. num_runs: int,
  573. console: Console,
  574. ) -> list[dict]:
  575. """Runs a benchmark binary multiple times and collects results.
  576. The results are parsed out of the JSON output from each run, and returned as
  577. a list of dictionaries. Each dictionary represents one run.
  578. This will log the command being run, show a progress bar for each run
  579. performed, and then log de-duplicated `stderr` output from the runs.
  580. """
  581. # If the binary path has no directory components and exists as a relative
  582. # file, add `./` as a prefix. Otherwise, we want to pass the name unchanged
  583. # for `PATH` search.
  584. binary_str = str(binary_path)
  585. if len(binary_path.parts) == 1 and binary_path.exists():
  586. binary_str = f"./{binary_str}"
  587. run_cmd = (
  588. [
  589. binary_str,
  590. "--benchmark_format=json",
  591. ]
  592. + common_args
  593. + specific_args
  594. )
  595. console.log(f"Executing: {' '.join(run_cmd)}")
  596. runs_data = []
  597. unique_stderr: list[bytes] = []
  598. for _ in track(
  599. range(num_runs), description=f"Running {binary_path.name}..."
  600. ):
  601. p = subprocess.run(
  602. run_cmd,
  603. check=True,
  604. stdout=subprocess.PIPE,
  605. stderr=subprocess.PIPE,
  606. )
  607. runs_data.append(json.loads(p.stdout))
  608. stderr = p.stderr.strip()
  609. if len(stderr) != 0 and stderr not in unique_stderr:
  610. unique_stderr.append(stderr)
  611. for stderr_output in unique_stderr:
  612. # Decode stderr, replacing errors in case of non-UTF-8 characters.
  613. console.log(
  614. f"{binary_path.name} stderr:\n"
  615. f"{stderr_output.decode('utf-8', errors='replace')}"
  616. )
  617. return runs_data
  618. def print_run_context(
  619. console: Console,
  620. num_runs: int,
  621. exp_runs: list[dict],
  622. has_baseline: bool,
  623. ) -> None:
  624. """Prints the context from the benchmark runs.
  625. This replicates the useful context information from Google Benchmark's
  626. default output, such as CPU information and cache sizes.
  627. TODO: Print differently when context of base and experiment runs differ.
  628. Args:
  629. console: The rich console to print to.
  630. num_runs: The number of times the benchmarks were run.
  631. exp_runs: The results from the experiment benchmark runs.
  632. has_baseline: Whether a baseline benchmark was also run.
  633. """
  634. if has_baseline:
  635. runs_description = f"Ran baseline and experiment {num_runs} times"
  636. else:
  637. runs_description = f"Ran {num_runs} times"
  638. context = exp_runs[0]["context"]
  639. console.print(
  640. f"{runs_description} on "
  641. f"{context['num_cpus']} x {context['mhz_per_cpu']} MHz CPUs"
  642. )
  643. console.print("CPU caches:")
  644. for cache in context["caches"]:
  645. size = Quantity(cache["size"], binary=True)
  646. console.print(f" L{cache['level']} {cache['type']} {size:b}")
  647. console.print(
  648. f"Load avg: {' '.join([str(load) for load in context['load_avg']])}"
  649. )
  650. def get_benchmark_names_and_metrics(
  651. parsed_args: argparse.Namespace,
  652. exp_runs: list[dict],
  653. base_runs: list[dict],
  654. ) -> tuple[list[str], list[str]]:
  655. """Extracts benchmark names and metrics from benchmark run results.
  656. This function determines the list of unique benchmark names and the metrics
  657. to be displayed based on the benchmark output and command-line arguments.
  658. Args:
  659. parsed_args: The parsed command-line arguments.
  660. exp_runs: A list of benchmark run results for the experiment binary.
  661. base_runs: A list of benchmark run results for the baseline binary.
  662. Returns:
  663. - The list of unique benchmark names, maintaining their order.
  664. - The list of metrics to display.
  665. """
  666. metrics: list[str] = []
  667. benchmark_names: list[str] = []
  668. # Start with the base time and iteration metrics requested.
  669. if parsed_args.wall_time:
  670. metrics.append("real_time")
  671. else:
  672. metrics.append("cpu_time")
  673. if parsed_args.show_iterations:
  674. metrics.append("iterations")
  675. # Compile a regex for filtering extra metrics, if provided.
  676. if metrics_filter_str := parsed_args.extra_metrics_filter:
  677. metrics_filter = re.compile(metrics_filter_str)
  678. else:
  679. metrics_filter = None
  680. # We only need to inspect the first run to find all benchmark and metric
  681. # names. We combine benchmarks from both experiment and baseline runs to get
  682. # a complete set.
  683. one_run_benchmarks = exp_runs[0]["benchmarks"]
  684. if parsed_args.base_benchmark:
  685. one_run_benchmarks += base_runs[0]["benchmarks"]
  686. for benchmark in one_run_benchmarks:
  687. name = benchmark["name"]
  688. # Add the benchmark name if we haven't seen it before to get a unique
  689. # list that preserves the order of appearance.
  690. if name not in benchmark_names:
  691. benchmark_names.append(name)
  692. # Add any extra metrics from this benchmark.
  693. for key in benchmark.keys():
  694. if key in metrics or key in IGNORED_BENCHMARK_KEYS:
  695. continue
  696. if metrics_filter and not re.search(metrics_filter, key):
  697. continue
  698. metrics.append(key)
  699. return benchmark_names, metrics
  700. def collect_benchmark_metrics(
  701. benchmark_names: list[str],
  702. metrics: list[str],
  703. exp_runs: list[dict],
  704. base_runs: list[dict],
  705. comp_mapping: ComparableBenchmarkMapping,
  706. ) -> dict[str, dict[str, BenchmarkRunMetrics]]:
  707. """Collects and organizes all benchmark metrics from raw run data.
  708. This function takes the raw benchmark run data and organizes it into a
  709. structured format suitable for analysis and rendering. It initializes the
  710. main data structure, handles the mapping of comparable benchmarks, and
  711. populates the metrics for both experiment and baseline runs.
  712. Args:
  713. benchmark_names: The initial list of unique benchmark names.
  714. metrics: A list of all metric names to be collected.
  715. exp_runs: A list of benchmark run results for the experiment binary.
  716. base_runs: A list of benchmark run results for the baseline binary.
  717. comp_mapping: The mapping of comparable benchmarks.
  718. Returns:
  719. A dictionary where keys are metric names. The values are another
  720. dictionary where keys are benchmark names and values are
  721. BenchmarkRunMetrics objects containing the collected measurements.
  722. """
  723. # Initialize the data structure to hold all collected metrics.
  724. benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]] = {
  725. metric: {name: BenchmarkRunMetrics() for name in benchmark_names}
  726. for metric in metrics
  727. }
  728. # Populate metrics from the experiment runs.
  729. for run in exp_runs:
  730. for b in run["benchmarks"]:
  731. name = b["name"]
  732. for metric in metrics:
  733. # Time metrics have a `time_unit` field that needs to be
  734. # appended for correct parsing by the Quantity library.
  735. unit = b.get("time_unit", "") if "time" in metric else ""
  736. # If this is a comparable benchmark, add its metrics to the
  737. # 'comps' list of its corresponding main benchmark.
  738. if maybe_comp_tag := comp_mapping.name_to_comp_tag.get(name):
  739. main_name = comp_mapping.base_to_main_name[
  740. comp_mapping.name_to_base[name]
  741. ]
  742. benchmark_metrics[metric][main_name].comps[
  743. maybe_comp_tag
  744. ].append(Quantity(f"{b[metric]}{unit}"))
  745. # Otherwise, add it to the 'exp' list of its own entry if it's
  746. # a main benchmark.
  747. elif name in benchmark_names:
  748. benchmark_metrics[metric][name].exp.append(
  749. Quantity(f"{b[metric]}{unit}")
  750. )
  751. # Populate metrics from the baseline runs.
  752. for run in base_runs:
  753. for b in run["benchmarks"]:
  754. name = b["name"]
  755. # Baseline runs don't have comparable benchmarks, so we only need
  756. # to populate the 'base' list for main benchmarks.
  757. if name in benchmark_names:
  758. for metric in metrics:
  759. unit = b.get("time_unit", "") if "time" in metric else ""
  760. benchmark_metrics[metric][name].base.append(
  761. Quantity(f"{b[metric]}{unit}")
  762. )
  763. return benchmark_metrics
  764. def print_metric_key(
  765. console: Console,
  766. alpha: float,
  767. has_baseline: bool,
  768. comp_mapping: ComparableBenchmarkMapping,
  769. ) -> None:
  770. """Prints a legend for the metrics table.
  771. This explains the format of the output table, including what the delta,
  772. median, and confidence interval values represent.
  773. Args:
  774. console: The rich console to print to.
  775. alpha: The alpha value for statistical significance.
  776. has_baseline: Whether a baseline benchmark was run.
  777. """
  778. console.print("Metric key:")
  779. conf = int(100 * (1.0 - alpha))
  780. name = "BenchmarkName..."
  781. delta_icon = str(DeltaKind.IMPROVEMENT)
  782. delta = "[faster]<delta>[/faster]"
  783. p = "p=<U-test P-value>"
  784. base_median = "[base_median]<median>[/base_median]"
  785. base_conf = f"[base_conf]<% at {conf}th conf>[/base_conf]"
  786. exp_median = "[exp_median]<median>[/exp_median]"
  787. exp_conf = f"[exp_conf]<% at {conf}th conf>[/exp_conf]"
  788. key_table = Table.grid(
  789. Column(justify="right"),
  790. Column(),
  791. Column(),
  792. Column(),
  793. Column(),
  794. padding=(0, 1),
  795. )
  796. if has_baseline:
  797. key_table.add_row(name, delta_icon, delta, "", p)
  798. key_table.add_row("baseline:", "", base_median, "±", base_conf)
  799. key_table.add_row("experiment:", "", exp_median, "±", exp_conf)
  800. else:
  801. key_table.add_row(name, "", exp_median, "±", exp_conf)
  802. # Only display comparable key if we have comparables to display.
  803. if bool(comp_mapping.name_to_comp_tag):
  804. key_table.add_row("vs Comparable:", delta_icon, delta, p)
  805. key_table.add_row("", "", base_median, "±", base_conf)
  806. console.print(Padding(key_table, (0, 0, 1, 3)))
  807. def print_results_table(
  808. console: Console,
  809. alpha: float,
  810. has_baseline: bool,
  811. metrics: list[str],
  812. benchmark_names: list[str],
  813. benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]],
  814. comp_mapping: ComparableBenchmarkMapping,
  815. ) -> None:
  816. """Builds and prints the main results table.
  817. This function constructs a rich `Table` to display the benchmark results,
  818. including deltas, medians, and confidence intervals for each metric. It then
  819. prints this to the provided `console`.
  820. Args:
  821. console: The rich console to print to.
  822. metrics: A list of metric names to be displayed as columns.
  823. benchmark_names: A list of main benchmark names for the rows.
  824. alpha: The alpha value for statistical significance.
  825. benchmark_metrics: A nested dictionary containing the collected metrics
  826. for each benchmark and metric.
  827. has_baseline: Whether a baseline benchmark was run.
  828. comp_mapping: The mapping of comparable benchmarks.
  829. """
  830. METRIC_TITLES = {
  831. "real_time": "Wall Time",
  832. "cpu_time": "CPU Time",
  833. "iterations": "Iterations",
  834. }
  835. name_width = max(
  836. (
  837. len(name)
  838. for name in (
  839. benchmark_names
  840. + [
  841. f"vs {tag}:"
  842. for tag in comp_mapping.name_to_comp_tag.values()
  843. ]
  844. + ["experiment:"]
  845. )
  846. )
  847. )
  848. table = Table(show_edge=False)
  849. # The benchmark name column we want to justify right for the sub-labels, but
  850. # we will fill the name in the column completed and the name will visually
  851. # be justified to the left, so force the heading to justify left unlike the
  852. # column text. We also disable wrapping because we manually fill the column
  853. # and require line-precise layout.
  854. table.add_column(
  855. Text("Benchmark", justify="left"), justify="right", no_wrap=True
  856. )
  857. for metric in metrics:
  858. title = Text(METRIC_TITLES.get(metric, metric), justify="center")
  859. table.add_column(title, justify="left", no_wrap=True)
  860. name_t = Table.grid(Column(justify="right", no_wrap=True), expand=True)
  861. for name in benchmark_names:
  862. name_t.add_row(f"{name}{'.' * (name_width - len(name))}")
  863. if has_baseline:
  864. name_t.add_row("baseline:")
  865. name_t.add_row("experiment:")
  866. name_t.add_row()
  867. elif comp_tags := comp_mapping.main_name_to_comp_tags.get(name):
  868. for tag in comp_tags:
  869. name_t.add_row(f"vs {tag}:")
  870. name_t.add_row()
  871. name_t.add_row()
  872. row = [name_t]
  873. for metric in metrics:
  874. metric_runs = benchmark_metrics[metric]
  875. row.append(
  876. render_metric_column(
  877. metric, alpha, [metric_runs[name] for name in benchmark_names]
  878. )
  879. )
  880. table.add_row(*row)
  881. console.print(table)
  882. def main() -> None:
  883. parsed_args = parse_args()
  884. console = Console(theme=THEME)
  885. Quantity.set_prefs(spacer=" ", map_sf=Quantity.map_sf_to_greek)
  886. if parsed_args.base_benchmark and parsed_args.benchmark_comparable_re:
  887. console.print(
  888. "ERROR: Cannot mix a base benchmark binary with benchmark "
  889. "comparisons."
  890. )
  891. sys.exit(1)
  892. # Run the benchmark(s) and collect the results into a data structure for
  893. # processing.
  894. num_runs = parsed_args.runs
  895. base_runs: list[dict] = []
  896. has_baseline = bool(parsed_args.base_benchmark)
  897. if has_baseline:
  898. base_runs = run_benchmark_binary(
  899. parsed_args.base_benchmark,
  900. parsed_args.benchmark_args,
  901. parsed_args.base_benchmark_args,
  902. num_runs,
  903. console,
  904. )
  905. exp_runs = run_benchmark_binary(
  906. parsed_args.exp_benchmark,
  907. parsed_args.benchmark_args,
  908. parsed_args.exp_benchmark_args,
  909. num_runs,
  910. console,
  911. )
  912. # If JSON output is requested, just dump the data without further
  913. # processing.
  914. if parsed_args.output == "json":
  915. console.log("Printing JSON results...")
  916. console.print_json(json.dumps(exp_runs))
  917. if has_baseline:
  918. console.print_json(json.dumps(base_runs))
  919. return
  920. print_run_context(console, num_runs, exp_runs, has_baseline)
  921. # Collect the benchmark names and metric names.
  922. benchmark_names, metrics = get_benchmark_names_and_metrics(
  923. parsed_args, exp_runs, base_runs
  924. )
  925. # Build any mappings between main benchmark names and comparables, and reset
  926. # our benchmark names to the main ones.
  927. comp_mapping = ComparableBenchmarkMapping(
  928. benchmark_names, parsed_args.benchmark_comparable_re, console
  929. )
  930. benchmark_names = comp_mapping.main_benchmark_names
  931. # Collect and organize the actual benchmark metrics from the raw JSON
  932. # structures across the runs. This pivots the data into an easy to analyze
  933. # and render structure, but doesn't do the analysis itself.
  934. benchmark_metrics = collect_benchmark_metrics(
  935. benchmark_names, metrics, exp_runs, base_runs, comp_mapping
  936. )
  937. # Analyze and render a readable table of the collected metrics. This is
  938. # where we do statistical analysis and render confidence intervals,
  939. # significance, and other helpful indicators based on the collected data. We
  940. # also print relevant keys to reading and interpreting the rendered data.
  941. alpha = parsed_args.alpha
  942. console.print(
  943. "Computing statistically significant deltas only where"
  944. f"the P-value < 𝛂 of {alpha}"
  945. )
  946. print_metric_key(console, alpha, has_baseline, comp_mapping)
  947. print_results_table(
  948. console,
  949. alpha,
  950. has_baseline,
  951. metrics,
  952. benchmark_names,
  953. benchmark_metrics,
  954. comp_mapping,
  955. )
# Allow this module to be executed directly as a script.
if __name__ == "__main__":
    main()