Bitcoin Core 24.99.0
nanobench.h
1 // __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2 // | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3 // | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4 //
5 // Microbenchmark framework for C++11/14/17/20
6 // https://github.com/martinus/nanobench
7 //
8 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9 // SPDX-License-Identifier: MIT
10 // Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining a copy
13 // of this software and associated documentation files (the "Software"), to deal
14 // in the Software without restriction, including without limitation the rights
15 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 // copies of the Software, and to permit persons to whom the Software is
17 // furnished to do so, subject to the following conditions:
18 //
19 // The above copyright notice and this permission notice shall be included in all
20 // copies or substantial portions of the Software.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 // SOFTWARE.
29 
30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
32 
33 // see https://semver.org/
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 10 // backwards-compatible bug fixes
37 
39 // public facing API - as minimal as possible
41 
42 #include <chrono> // high_resolution_clock
43 #include <cstring> // memcpy
44 #include <iosfwd> // for std::ostream* custom output target in Config
45 #include <string> // all names
46 #include <unordered_map> // holds context information of results
47 #include <vector> // holds all results
48 
49 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
50 
51 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
52 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
55 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
56 
57 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
58 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
59 #else
60 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
61 #endif
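// A quick illustration of the ANKERL_NANOBENCH(x) dispatch above, assuming a C++17
// compiler: ANKERL_NANOBENCH(NODISCARD) expands to ANKERL_NANOBENCH_PRIVATE_NODISCARD(),
// which in turn expands to [[nodiscard]], and ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
// becomes __cplusplus >= 201703L.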
62 
63 #if defined(__clang__)
64 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
65  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
66 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
67 #else
68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
69 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
70 #endif
71 
72 #if defined(__GNUC__)
73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
74 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
75 #else
76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
77 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
78 #endif
79 
80 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
81 # include <iostream>
82 # define ANKERL_NANOBENCH_LOG(x) \
83  do { \
84  std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
85  } while (0)
86 #else
87 # define ANKERL_NANOBENCH_LOG(x) \
88  do { \
89  } while (0)
90 #endif
91 
92 #define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
93 #if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
94 # include <linux/version.h>
95 # if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
96 // PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
97 // PERF_FLAG_FD_CLOEXEC since kernel 3.14
98 # undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
99 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
100 # endif
101 #endif
102 
103 #if defined(__clang__)
104 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
105 #else
106 # define ANKERL_NANOBENCH_NO_SANITIZE(...)
107 #endif
108 
109 #if defined(_MSC_VER)
110 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
111 #else
112 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
113 #endif
114 
115 // workaround missing "is_trivially_copyable" in g++ < 5.0
116 // See https://stackoverflow.com/a/31798726/48181
117 #if defined(__GNUC__) && __GNUC__ < 5
118 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
119 #else
120 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
121 #endif
122 
123 // declarations ///////////////////////////////////////////////////////////////////////////////////
124 
125 namespace ankerl {
126 namespace nanobench {
127 
128 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
129  std::chrono::steady_clock>::type;
130 class Bench;
131 struct Config;
132 class Result;
133 class Rng;
134 class BigO;
135 
286 void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
287 void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
288 
297 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
298 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
299 
300 // Contains mustache-like templates
301 namespace templates {
302 
312 char const* csv() noexcept;
313 
324 char const* htmlBoxplot() noexcept;
325 
332 char const* pyperf() noexcept;
333 
343 char const* json() noexcept;
344 
345 } // namespace templates
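// Illustrative sketch of how the templates above are typically combined with render()
// (assumes a Bench object named `bench` that has already produced results, and that the
// caller includes <fstream> and <iostream>):
//
//     std::ofstream fout("results.json");
//     ankerl::nanobench::render(ankerl::nanobench::templates::json(), bench, fout);
//
//     // or write CSV straight to stdout:
//     ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);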
346 
347 namespace detail {
348 
349 template <typename T>
350 struct PerfCountSet;
351 
352 class IterationLogic;
353 class PerformanceCounters;
354 
355 #if ANKERL_NANOBENCH(PERF_COUNTERS)
356 class LinuxPerformanceCounters;
357 #endif
358 
359 } // namespace detail
360 } // namespace nanobench
361 } // namespace ankerl
362 
363 // definitions ////////////////////////////////////////////////////////////////////////////////////
364 
365 namespace ankerl {
366 namespace nanobench {
367 namespace detail {
368 
369 template <typename T>
370 struct PerfCountSet {
377 };
378 
379 } // namespace detail
380 
381 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
382 struct Config {
383  // actual benchmark config
384  std::string mBenchmarkTitle = "benchmark"; // NOLINT(misc-non-private-member-variables-in-classes)
385  std::string mBenchmarkName = "noname"; // NOLINT(misc-non-private-member-variables-in-classes)
386  std::string mUnit = "op"; // NOLINT(misc-non-private-member-variables-in-classes)
387  double mBatch = 1.0; // NOLINT(misc-non-private-member-variables-in-classes)
388  double mComplexityN = -1.0; // NOLINT(misc-non-private-member-variables-in-classes)
389  size_t mNumEpochs = 11; // NOLINT(misc-non-private-member-variables-in-classes)
390  size_t mClockResolutionMultiple = static_cast<size_t>(1000); // NOLINT(misc-non-private-member-variables-in-classes)
391  std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)
392  std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1); // NOLINT(misc-non-private-member-variables-in-classes)
393  uint64_t mMinEpochIterations{1}; // NOLINT(misc-non-private-member-variables-in-classes)
394  // If not 0, run *exactly* this number of iterations per epoch.
395  uint64_t mEpochIterations{0}; // NOLINT(misc-non-private-member-variables-in-classes)
396  uint64_t mWarmup = 0; // NOLINT(misc-non-private-member-variables-in-classes)
397  std::ostream* mOut = nullptr; // NOLINT(misc-non-private-member-variables-in-classes)
398  std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)
399  std::string mTimeUnitName = "ns"; // NOLINT(misc-non-private-member-variables-in-classes)
400  bool mShowPerformanceCounters = true; // NOLINT(misc-non-private-member-variables-in-classes)
401  bool mIsRelative = false; // NOLINT(misc-non-private-member-variables-in-classes)
402  std::unordered_map<std::string, std::string> mContext{}; // NOLINT(misc-non-private-member-variables-in-classes)
403 
406  Config& operator=(Config const& other);
407  Config& operator=(Config&& other) noexcept;
408  Config(Config const& other);
409  Config(Config&& other) noexcept;
410 };
411 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
412 
413 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
414 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
415 class Result {
416 public:
417  enum class Measure : size_t {
418  elapsed,
419  iterations,
420  pagefaults,
421  cpucycles,
422  contextswitches,
423  instructions,
424  branchinstructions,
425  branchmisses,
426  _size
427  };
428 
429  explicit Result(Config benchmarkConfig);
430 
432  Result& operator=(Result const& other);
433  Result& operator=(Result&& other) noexcept;
434  Result(Result const& other);
435  Result(Result&& other) noexcept;
436 
437  // adds new measurement results
438  // all values are scaled by iters (except iters...)
439  void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
440 
441  ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
442 
443  ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
444  ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
445  ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
446  ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
447  ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
448  ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
449  ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
450  ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;
451  ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;
452 
453  ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
454  ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
455  ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
456  ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
457 
458  // Parses the given string; if not found, returns _size.
459  static Measure fromString(std::string const& str);
460 
461 private:
462  Config mConfig{};
463  std::vector<std::vector<double>> mNameToMeasurements{};
464 };
465 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
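// Sketch of how a Result is typically queried after a run (assumes a Bench object named
// `bench` whose results() vector is non-empty; all names used here are declared in this header):
//
//     ankerl::nanobench::Result const& r = bench.results().front();
//     double elapsedMedian = r.median(ankerl::nanobench::Result::Measure::elapsed);
//     double errPct = r.medianAbsolutePercentError(ankerl::nanobench::Result::Measure::elapsed);
//     bool hasInstr = r.has(ankerl::nanobench::Result::Measure::instructions);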
466 
467 
484 class Rng final {
485 public:
489  using result_type = uint64_t;
490 
491  static constexpr uint64_t(min)();
492  static constexpr uint64_t(max)();
493 
499  Rng(Rng const&) = delete;
500 
504  Rng& operator=(Rng const&) = delete;
505 
506  // moving is ok
507  Rng(Rng&&) noexcept = default;
508  Rng& operator=(Rng&&) noexcept = default;
509  ~Rng() noexcept = default;
510 
518  Rng();
519 
536  explicit Rng(uint64_t seed) noexcept;
537  Rng(uint64_t x, uint64_t y) noexcept;
538  explicit Rng(std::vector<uint64_t> const& data);
539 
543  ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
544 
552  inline uint64_t operator()() noexcept;
553 
554  // This is slightly biased (the bounded() implementation below maps without a rejection step).
555 
570  inline uint32_t bounded(uint32_t range) noexcept;
571 
572  // random double in range [0, 1)
573  // see http://prng.di.unimi.it/
574 
581  inline double uniform01() noexcept;
582 
590  template <typename Container>
591  void shuffle(Container& container) noexcept;
592 
599  std::vector<uint64_t> state() const;
600 
601 private:
602  static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
603 
604  uint64_t mX;
605  uint64_t mY;
606 };
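// Minimal usage sketch for Rng (deterministic for a fixed seed, but not suitable for
// cryptography; <vector> is already included by this header):
//
//     ankerl::nanobench::Rng rng(12345);
//     uint64_t r = rng();             // full 64 bit random value
//     uint32_t die = rng.bounded(6);  // value in [0, 6)
//     double p = rng.uniform01();     // value in [0, 1)
//     std::vector<uint64_t> v = {1, 2, 3, 4};
//     rng.shuffle(v);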
607 
622 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
623 class Bench {
624 public:
628  Bench();
629 
630  Bench(Bench&& other) noexcept;
631  Bench& operator=(Bench&& other) noexcept;
632  Bench(Bench const& other);
633  Bench& operator=(Bench const& other);
634  ~Bench() noexcept;
635 
654  template <typename Op>
655  ANKERL_NANOBENCH(NOINLINE)
656  Bench& run(char const* benchmarkName, Op&& op);
657 
658  template <typename Op>
659  ANKERL_NANOBENCH(NOINLINE)
660  Bench& run(std::string const& benchmarkName, Op&& op);
661 
666  template <typename Op>
667  ANKERL_NANOBENCH(NOINLINE)
668  Bench& run(Op&& op);
669 
675  Bench& title(char const* benchmarkTitle);
676  Bench& title(std::string const& benchmarkTitle);
677 
681  ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
682 
684  Bench& name(char const* benchmarkName);
685  Bench& name(std::string const& benchmarkName);
686  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
687 
700  Bench& context(char const* variableName, char const* variableValue);
701  Bench& context(std::string const& variableName, std::string const& variableValue);
702 
711  Bench& clearContext();
712 
722  template <typename T>
723  Bench& batch(T b) noexcept;
724  ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
725 
734  Bench& unit(char const* unit);
735  Bench& unit(std::string const& unit);
736  ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
737 
747  Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
748  ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
749  ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
750 
758  Bench& output(std::ostream* outstream) noexcept;
759  ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
760 
781  Bench& clockResolutionMultiple(size_t multiple) noexcept;
782  ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
783 
799  Bench& epochs(size_t numEpochs) noexcept;
800  ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
801 
812  Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
813  ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
814 
825  Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
826  ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
827 
838  Bench& minEpochIterations(uint64_t numIters) noexcept;
839  ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
840 
847  Bench& epochIterations(uint64_t numIters) noexcept;
848  ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
849 
859  Bench& warmup(uint64_t numWarmupIters) noexcept;
860  ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
861 
879  Bench& relative(bool isRelativeEnabled) noexcept;
880  ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
881 
890  Bench& performanceCounters(bool showPerformanceCounters) noexcept;
891  ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
892 
901  ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
902 
910  template <typename Arg>
911  Bench& doNotOptimizeAway(Arg&& arg);
912 
927  template <typename T>
928  Bench& complexityN(T n) noexcept;
929  ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
930 
962  std::vector<BigO> complexityBigO() const;
963 
987  template <typename Op>
988  BigO complexityBigO(char const* name, Op op) const;
989 
990  template <typename Op>
991  BigO complexityBigO(std::string const& name, Op op) const;
992 
1000  Bench& render(char const* templateContent, std::ostream& os);
1001  Bench& render(std::string const& templateContent, std::ostream& os);
1002 
1003  Bench& config(Config const& benchmarkConfig);
1004  ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
1005 
1006 private:
1007  Config mConfig{};
1008  std::vector<Result> mResults{};
1009 };
1010 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
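// A minimal end-to-end sketch of the Bench API declared above (assumes that exactly one
// translation unit defines ANKERL_NANOBENCH_IMPLEMENT before including this header, and
// that <atomic> is included by the caller):
//
//     int main() {
//         std::atomic<int> x{0};
//         ankerl::nanobench::Bench()
//             .title("atomic operations")
//             .unit("op")
//             .warmup(100)
//             .minEpochIterations(1000)
//             .run("compare_exchange_strong", [&] {
//                 int expected = 0;
//                 x.compare_exchange_strong(expected, 0);
//             });
//     }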
1011 
1012 
1018 template <typename Arg>
1019 void doNotOptimizeAway(Arg&& arg);
1020 
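// Typical use of doNotOptimizeAway inside a benchmarked lambda, so the compiler cannot
// elide the computation (sketch only; assumes <cmath> is included by the caller):
//
//     bench.run("sqrt", [&] {
//         ankerl::nanobench::doNotOptimizeAway(std::sqrt(3.14159));
//     });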
1021 namespace detail {
1022 
1023 #if defined(_MSC_VER)
1024 void doNotOptimizeAwaySink(void const*);
1025 
1026 template <typename T>
1027 void doNotOptimizeAway(T const& val);
1028 
1029 #else
1030 
1031 // This assembly magic is taken directly from what Google Benchmark does. I previously used what Facebook's folly does, but
1032 // that caused compilation problems in some cases. Google Benchmark seems to be the best tested anyway.
1033 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
1034 template <typename T>
1035 void doNotOptimizeAway(T const& val) {
1036  // NOLINTNEXTLINE(hicpp-no-assembler)
1037  asm volatile("" : : "r,m"(val) : "memory");
1038 }
1039 
1040 template <typename T>
1041 void doNotOptimizeAway(T& val) {
1042 # if defined(__clang__)
1043  // NOLINTNEXTLINE(hicpp-no-assembler)
1044  asm volatile("" : "+r,m"(val) : : "memory");
1045 # else
1046  // NOLINTNEXTLINE(hicpp-no-assembler)
1047  asm volatile("" : "+m,r"(val) : : "memory");
1048 # endif
1049 }
1050 #endif
1051 
1052 // Internally used, but visible because run() is templated.
1053 // Not movable/copyable, so we simply use a raw pointer instead of unique_ptr. This saves us from
1054 // having to include <memory>, and avoids the template instantiation overhead of unique_ptr, which is unfortunately quite significant.
1055 ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1056 class IterationLogic {
1057 public:
1058  explicit IterationLogic(Bench const& bench);
1061  IterationLogic(IterationLogic const&) = delete;
1064 
1065  ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1066  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1067  void moveResultTo(std::vector<Result>& results) noexcept;
1068 
1069 private:
1070  struct Impl;
1071  Impl* mPimpl;
1072 };
1073 ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1074 
1075 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1076 class PerformanceCounters {
1077 public:
1082 
1085 
1086  void beginMeasure();
1087  void endMeasure();
1088  void updateResults(uint64_t numIters);
1089 
1090  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1091  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1092 
1093 private:
1094 #if ANKERL_NANOBENCH(PERF_COUNTERS)
1095  LinuxPerformanceCounters* mPc = nullptr;
1096 #endif
1099 };
1100 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1101 
1102 // Gets the singleton
1103 PerformanceCounters& performanceCounters();
1104 
1105 } // namespace detail
1106 
1107 class BigO {
1108 public:
1109  using RangeMeasure = std::vector<std::pair<double, double>>;
1110 
1111  template <typename Op>
1112  static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
1113  for (auto& rangeMeasure : data) {
1114  rangeMeasure.first = op(rangeMeasure.first);
1115  }
1116  return data;
1117  }
1118 
1119  static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1120 
1121  template <typename Op>
1122  BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1123  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1124 
1125  template <typename Op>
1126  BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1127  : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}
1128 
1129  BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1130  BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);
1131  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1132  ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1133  ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1134  ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1135 
1136 private:
1137  std::string mName{};
1138  double mConstant{};
1139  double mNormalizedRootMeanSquare{};
1140 };
1141 std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1142 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
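// Asymptotic-complexity sketch using the declarations above: run the same benchmark for
// several problem sizes, record each size with complexityN(), then fit Big-O curves.
// The container setup is purely illustrative (assumes <algorithm> and <iostream>):
//
//     ankerl::nanobench::Bench bench;
//     for (size_t n = 10; n <= 100000; n *= 10) {
//         std::vector<uint64_t> v(n);
//         bench.complexityN(n).run("fill", [&] {
//             std::fill(v.begin(), v.end(), 123);
//         });
//     }
//     std::cout << bench.complexityBigO() << std::endl;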
1143 
1144 } // namespace nanobench
1145 } // namespace ankerl
1146 
1147 // implementation /////////////////////////////////////////////////////////////////////////////////
1148 
1149 namespace ankerl {
1150 namespace nanobench {
1151 
1152 constexpr uint64_t(Rng::min)() {
1153  return 0;
1154 }
1155 
1156 constexpr uint64_t(Rng::max)() {
1157  return (std::numeric_limits<uint64_t>::max)();
1158 }
1159 
1160 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1161 uint64_t Rng::operator()() noexcept {
1162  auto x = mX;
1163 
1164  mX = UINT64_C(15241094284759029579) * mY;
1165  mY = rotl(mY - x, 27);
1166 
1167  return x;
1168 }
1169 
1170 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1171 uint32_t Rng::bounded(uint32_t range) noexcept {
1172  uint64_t const r32 = static_cast<uint32_t>(operator()());
1173  auto multiresult = r32 * range;
1174  return static_cast<uint32_t>(multiresult >> 32U);
1175 }
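// Why the multiply-shift above works: r32 * range is a 64 bit product of a 32 bit random
// value and range, so its upper 32 bits are an (almost) uniformly distributed value in
// [0, range). Skipping the usual rejection step is what makes the result slightly biased.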
1176 
1177 double Rng::uniform01() noexcept {
1178  auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1179  // can't use a union in C++ here for type punning, it's undefined behavior.
1180  // std::memcpy is well optimized anyway.
1181  double d{};
1182  std::memcpy(&d, &i, sizeof(double));
1183  return d - 1.0;
1184 }
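// How the bit trick above works: (0x3ff << 52) sets the IEEE-754 exponent so that the
// assembled double lies in [1.0, 2.0); the top 52 random bits fill the mantissa, and
// subtracting 1.0 maps the result into [0, 1).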
1185 
1186 template <typename Container>
1187 void Rng::shuffle(Container& container) noexcept {
1188  auto i = container.size();
1189  while (i > 1U) {
1190  using std::swap;
1191  auto n = operator()();
1192  // using decltype(i) instead of size_t to be compatible with containers that use a 32-bit index (see #80)
1193  auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
1194  swap(container[--i], container[b1]);
1195 
1196  auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
1197  swap(container[--i], container[b2]);
1198  }
1199 }
1200 
1201 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1202 constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1203  return (x << k) | (x >> (64U - k));
1204 }
1205 
1206 template <typename Op>
1208 Bench& Bench::run(Op&& op) {
1209  // It is important that this method is kept short so the compiler can do better optimization and inlining of op()
1210  detail::IterationLogic iterationLogic(*this);
1211  auto& pc = detail::performanceCounters();
1212 
1213  while (auto n = iterationLogic.numIters()) {
1214  pc.beginMeasure();
1215  Clock::time_point const before = Clock::now();
1216  while (n-- > 0) {
1217  op();
1218  }
1219  Clock::time_point const after = Clock::now();
1220  pc.endMeasure();
1221  pc.updateResults(iterationLogic.numIters());
1222  iterationLogic.add(after - before, pc);
1223  }
1224  iterationLogic.moveResultTo(mResults);
1225  return *this;
1226 }
1227 
1228 // Performs all evaluations.
1229 template <typename Op>
1230 Bench& Bench::run(char const* benchmarkName, Op&& op) {
1231  name(benchmarkName);
1232  return run(std::forward<Op>(op));
1233 }
1234 
1235 template <typename Op>
1236 Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1237  name(benchmarkName);
1238  return run(std::forward<Op>(op));
1239 }
1240 
1241 template <typename Op>
1242 BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1243  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1244 }
1245 
1246 template <typename Op>
1247 BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1248  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1249 }
1250 
1251 // Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1252 // Any argument is cast to double.
1253 template <typename T>
1254 Bench& Bench::batch(T b) noexcept {
1255  mConfig.mBatch = static_cast<double>(b);
1256  return *this;
1257 }
1258 
1259 // Sets the computation complexity of the next run. Any argument is cast to double.
1260 template <typename T>
1261 Bench& Bench::complexityN(T n) noexcept {
1262  mConfig.mComplexityN = static_cast<double>(n);
1263  return *this;
1264 }
1265 
1266 // Convenience: makes sure none of the given arguments are optimized away by the compiler.
1267 template <typename Arg>
1268 Bench& Bench::doNotOptimizeAway(Arg&& arg) {
1269  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1270  return *this;
1271 }
1272 
1273 // Makes sure none of the given arguments are optimized away by the compiler.
1274 template <typename Arg>
1275 void doNotOptimizeAway(Arg&& arg) {
1276  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1277 }
1278 
1279 namespace detail {
1280 
1281 #if defined(_MSC_VER)
1282 template <typename T>
1283 void doNotOptimizeAway(T const& val) {
1284  doNotOptimizeAwaySink(&val);
1285 }
1286 
1287 #endif
1288 
1289 } // namespace detail
1290 } // namespace nanobench
1291 } // namespace ankerl
1292 
1293 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1294 
1296 // implementation part - only visible in .cpp
1298 
1299 # include <algorithm> // sort, reverse
1300 # include <atomic> // compare_exchange_strong in loop overhead
1301 # include <cstdlib> // getenv
1302 # include <cstring> // strstr, strncmp
1303 # include <fstream> // ifstream to parse proc files
1304 # include <iomanip> // setw, setprecision
1305 # include <iostream> // cout
1306 # include <numeric> // accumulate
1307 # include <random> // random_device
1308 # include <sstream> // to_s in Number
1309 # include <stdexcept> // throw for rendering templates
1310 # include <tuple> // std::tie
1311 # if defined(__linux__)
1312 # include <unistd.h> //sysconf
1313 # endif
1314 # if ANKERL_NANOBENCH(PERF_COUNTERS)
1315 # include <map> // map
1316 
1317 # include <linux/perf_event.h>
1318 # include <sys/ioctl.h>
1319 # include <sys/syscall.h>
1320 # endif
1321 
1322 // declarations ///////////////////////////////////////////////////////////////////////////////////
1323 
1324 namespace ankerl {
1325 namespace nanobench {
1326 
1327 // helper stuff that is only intended to be used internally
1328 namespace detail {
1329 
1330 struct TableInfo;
1331 
1332 // formatting utilities
1333 namespace fmt {
1334 
1335 class NumSep;
1336 class StreamStateRestorer;
1337 class Number;
1338 class MarkDownColumn;
1339 class MarkDownCode;
1340 
1341 } // namespace fmt
1342 } // namespace detail
1343 } // namespace nanobench
1344 } // namespace ankerl
1345 
1346 // definitions ////////////////////////////////////////////////////////////////////////////////////
1347 
1348 namespace ankerl {
1349 namespace nanobench {
1350 
1351 uint64_t splitMix64(uint64_t& state) noexcept;
1352 
1353 namespace detail {
1354 
1355 // helpers to get double values
1356 template <typename T>
1357 inline double d(T t) noexcept {
1358  return static_cast<double>(t);
1359 }
1360 inline double d(Clock::duration duration) noexcept {
1361  return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1362 }
1363 
1364 // Calculates clock resolution once, and remembers the result
1365 inline Clock::duration clockResolution() noexcept;
1366 
1367 } // namespace detail
1368 
1369 namespace templates {
1370 
1371 char const* csv() noexcept {
1372  return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1373 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1374 {{/result}})DELIM";
1375 }
1376 
1377 char const* htmlBoxplot() noexcept {
1378  return R"DELIM(<html>
1379 
1380 <head>
1381  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1382 </head>
1383 
1384 <body>
1385  <div id="myDiv"></div>
1386  <script>
1387  var data = [
1388  {{#result}}{
1389  name: '{{name}}',
1390  y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1391  },
1392  {{/result}}
1393  ];
1394  var title = '{{title}}';
1395 
1396  data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1397  var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1398  </script>
1399 </body>
1400 
1401 </html>)DELIM";
1402 }
1403 
1404 char const* pyperf() noexcept {
1405  return R"DELIM({
1406  "benchmarks": [
1407  {
1408  "runs": [
1409  {
1410  "values": [
1411 {{#measurement}} {{elapsed}}{{^-last}},
1412 {{/last}}{{/measurement}}
1413  ]
1414  }
1415  ]
1416  }
1417  ],
1418  "metadata": {
1419  "loops": {{sum(iterations)}},
1420  "inner_loops": {{batch}},
1421  "name": "{{title}}",
1422  "unit": "second"
1423  },
1424  "version": "1.0"
1425 })DELIM";
1426 }
1427 
1428 char const* json() noexcept {
1429  return R"DELIM({
1430  "results": [
1431 {{#result}} {
1432  "title": "{{title}}",
1433  "name": "{{name}}",
1434  "unit": "{{unit}}",
1435  "batch": {{batch}},
1436  "complexityN": {{complexityN}},
1437  "epochs": {{epochs}},
1438  "clockResolution": {{clockResolution}},
1439  "clockResolutionMultiple": {{clockResolutionMultiple}},
1440  "maxEpochTime": {{maxEpochTime}},
1441  "minEpochTime": {{minEpochTime}},
1442  "minEpochIterations": {{minEpochIterations}},
1443  "epochIterations": {{epochIterations}},
1444  "warmup": {{warmup}},
1445  "relative": {{relative}},
1446  "median(elapsed)": {{median(elapsed)}},
1447  "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1448  "median(instructions)": {{median(instructions)}},
1449  "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1450  "median(cpucycles)": {{median(cpucycles)}},
1451  "median(contextswitches)": {{median(contextswitches)}},
1452  "median(pagefaults)": {{median(pagefaults)}},
1453  "median(branchinstructions)": {{median(branchinstructions)}},
1454  "median(branchmisses)": {{median(branchmisses)}},
1455  "totalTime": {{sumProduct(iterations, elapsed)}},
1456  "measurements": [
1457 {{#measurement}} {
1458  "iterations": {{iterations}},
1459  "elapsed": {{elapsed}},
1460  "pagefaults": {{pagefaults}},
1461  "cpucycles": {{cpucycles}},
1462  "contextswitches": {{contextswitches}},
1463  "instructions": {{instructions}},
1464  "branchinstructions": {{branchinstructions}},
1465  "branchmisses": {{branchmisses}}
1466  }{{^-last}},{{/-last}}
1467 {{/measurement}} ]
1468  }{{^-last}},{{/-last}}
1469 {{/result}} ]
1470 })DELIM";
1471 }
1472 
1473 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1474 struct Node {
1475  enum class Type { tag, content, section, inverted_section };
1476 
1477  char const* begin;
1478  char const* end;
1479  std::vector<Node> children;
1480  Type type;
1481 
1482  template <size_t N>
1483  // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
1484  bool operator==(char const (&str)[N]) const noexcept {
1485  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
1486  return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1487  }
1488 };
1489 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1490 
1491 // NOLINTNEXTLINE(misc-no-recursion)
1492 static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1493  std::vector<Node> nodes;
1494 
1495  while (true) {
1496  auto const* begin = std::strstr(*tpl, "{{");
1497  auto const* end = begin;
1498  if (begin != nullptr) {
1499  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1500  begin += 2;
1501  end = std::strstr(begin, "}}");
1502  }
1503 
1504  if (begin == nullptr || end == nullptr) {
1505  // nothing found, finish node
1506  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1507  nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1508  return nodes;
1509  }
1510 
1511  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1512  nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1513 
1514  // we found a tag
1515  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1516  *tpl = end + 2;
1517  switch (*begin) {
1518  case '/':
1519  // finished! bail out
1520  return nodes;
1521 
1522  case '#':
1523  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1524  nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1525  break;
1526 
1527  case '^':
1528  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1529  nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1530  break;
1531 
1532  default:
1533  nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1534  break;
1535  }
1536  }
1537 }
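// Example of what parseMustacheTemplate() produces for the template
// "x{{#result}}{{name}}{{/result}}y": a content node ("x"), a section node ("result")
// whose children are a tag node ("name") surrounded by two empty content nodes, and a
// trailing content node ("y").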
1538 
1539 static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1540  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1541  bool const matchFirst = n == "-first";
1542  bool const matchLast = n == "-last";
1543  if (!matchFirst && !matchLast) {
1544  return false;
1545  }
1546 
1547  bool doWrite = false;
1548  if (n.type == Node::Type::section) {
1549  doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1550  } else if (n.type == Node::Type::inverted_section) {
1551  doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1552  }
1553 
1554  if (doWrite) {
1555  for (auto const& child : n.children) {
1556  if (child.type == Node::Type::content) {
1557  out.write(child.begin, std::distance(child.begin, child.end));
1558  }
1559  }
1560  }
1561  return true;
1562 }
1563 
1564 static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
1565  matchResult.clear();
1566  auto idxOpen = str.find('(');
1567  auto idxClose = str.find(')', idxOpen);
1568  if (idxClose == std::string::npos) {
1569  return false;
1570  }
1571 
1572  matchResult.emplace_back(str.substr(0, idxOpen));
1573 
1574  // split by comma
1575  matchResult.emplace_back();
1576  for (size_t i = idxOpen + 1; i != idxClose; ++i) {
1577  if (str[i] == ' ' || str[i] == '\t') {
1578  // skip whitespace
1579  continue;
1580  }
1581  if (str[i] == ',') {
1582  // got a comma => new string
1583  matchResult.emplace_back();
1584  continue;
1585  }
1586  // no whitespace no comma, append
1587  matchResult.back() += str[i];
1588  }
1589  return true;
1590 }
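// For example, matchCmdArgs("median(elapsed)", m) sets m to {"median", "elapsed"}, and
// matchCmdArgs("sumProduct(iterations, elapsed)", m) sets m to
// {"sumProduct", "iterations", "elapsed"}. This is what generateResultTag() below uses
// instead of std::regex.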
1591 
1592 static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1593  using detail::d;
1594 
1595  if (n == "title") {
1596  out << config.mBenchmarkTitle;
1597  return true;
1598  }
1599  if (n == "name") {
1600  out << config.mBenchmarkName;
1601  return true;
1602  }
1603  if (n == "unit") {
1604  out << config.mUnit;
1605  return true;
1606  }
1607  if (n == "batch") {
1608  out << config.mBatch;
1609  return true;
1610  }
1611  if (n == "complexityN") {
1612  out << config.mComplexityN;
1613  return true;
1614  }
1615  if (n == "epochs") {
1616  out << config.mNumEpochs;
1617  return true;
1618  }
1619  if (n == "clockResolution") {
1620  out << d(detail::clockResolution());
1621  return true;
1622  }
1623  if (n == "clockResolutionMultiple") {
1624  out << config.mClockResolutionMultiple;
1625  return true;
1626  }
1627  if (n == "maxEpochTime") {
1628  out << d(config.mMaxEpochTime);
1629  return true;
1630  }
1631  if (n == "minEpochTime") {
1632  out << d(config.mMinEpochTime);
1633  return true;
1634  }
1635  if (n == "minEpochIterations") {
1636  out << config.mMinEpochIterations;
1637  return true;
1638  }
1639  if (n == "epochIterations") {
1640  out << config.mEpochIterations;
1641  return true;
1642  }
1643  if (n == "warmup") {
1644  out << config.mWarmup;
1645  return true;
1646  }
1647  if (n == "relative") {
1648  out << config.mIsRelative;
1649  return true;
1650  }
1651  return false;
1652 }
1653 
1654 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
1655 static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1656  if (generateConfigTag(n, r.config(), out)) {
1657  return out;
1658  }
1659  // match e.g. "median(elapsed)"
1660  // g++ 4.8 doesn't implement std::regex :(
1661  // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
1662  // std::cmatch matchResult;
1663  // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1664  std::vector<std::string> matchResult;
1665  if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1666  if (matchResult.size() == 2) {
1667  if (matchResult[0] == "context") {
1668  return out << r.context(matchResult[1]);
1669  }
1670 
1671  auto m = Result::fromString(matchResult[1]);
1672  if (m == Result::Measure::_size) {
1673  return out << 0.0;
1674  }
1675 
1676  if (matchResult[0] == "median") {
1677  return out << r.median(m);
1678  }
1679  if (matchResult[0] == "average") {
1680  return out << r.average(m);
1681  }
1682  if (matchResult[0] == "medianAbsolutePercentError") {
1683  return out << r.medianAbsolutePercentError(m);
1684  }
1685  if (matchResult[0] == "sum") {
1686  return out << r.sum(m);
1687  }
1688  if (matchResult[0] == "minimum") {
1689  return out << r.minimum(m);
1690  }
1691  if (matchResult[0] == "maximum") {
1692  return out << r.maximum(m);
1693  }
1694  } else if (matchResult.size() == 3) {
1695  auto m1 = Result::fromString(matchResult[1]);
1696  auto m2 = Result::fromString(matchResult[2]);
1697  if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1698  return out << 0.0;
1699  }
1700 
1701  if (matchResult[0] == "sumProduct") {
1702  return out << r.sumProduct(m1, m2);
1703  }
1704  }
1705  }
1706 
1707  // match e.g. "sumProduct(elapsed, iterations)"
1708  // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");
1709 
1710  // nothing matches :(
1711  throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1712 }
1713 
1714 static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1715  for (auto const& n : nodes) {
1716  if (!generateFirstLast(n, idx, r.size(), out)) {
1717  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1718  switch (n.type) {
1719  case Node::Type::content:
1720  out.write(n.begin, std::distance(n.begin, n.end));
1721  break;
1722 
1723  case Node::Type::inverted_section:
1724  throw std::runtime_error("got an inverted section inside measurement");
1725 
1726  case Node::Type::section:
1727  throw std::runtime_error("got a section inside measurement");
1728 
1729  case Node::Type::tag: {
1730  auto m = Result::fromString(std::string(n.begin, n.end));
1731  if (m == Result::Measure::_size || !r.has(m)) {
1732  out << 0.0;
1733  } else {
1734  out << r.get(idx, m);
1735  }
1736  break;
1737  }
1738  }
1739  }
1740  }
1741 }
1742 
1743 static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1744  auto const& r = results[idx];
1745  for (auto const& n : nodes) {
1746  if (!generateFirstLast(n, idx, results.size(), out)) {
1747  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1748  switch (n.type) {
1749  case Node::Type::content:
1750  out.write(n.begin, std::distance(n.begin, n.end));
1751  break;
1752 
1753  case Node::Type::inverted_section:
1754  throw std::runtime_error("got an inverted section inside result");
1755 
1756  case Node::Type::section:
1757  if (n == "measurement") {
1758  for (size_t i = 0; i < r.size(); ++i) {
1759  generateResultMeasurement(n.children, i, r, out);
1760  }
1761  } else {
1762  throw std::runtime_error("got a section inside result");
1763  }
1764  break;
1765 
1766  case Node::Type::tag:
1767  generateResultTag(n, r, out);
1768  break;
1769  }
1770  }
1771  }
1772 }
1773 
1774 } // namespace templates
1775 
1776 // helper stuff that is only intended to be used internally
1777 namespace detail {
1778 
1779 char const* getEnv(char const* name);
1780 bool isEndlessRunning(std::string const& name);
1781 bool isWarningsEnabled();
1782 
1783 template <typename T>
1784 T parseFile(std::string const& filename);
1785 
1786 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1787 void printStabilityInformationOnce(std::ostream* outStream);
1788 
1789 // remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1790 uint64_t& singletonHeaderHash() noexcept;
1791 
1792 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1793 Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1794 
1795 // formatting utilities
1796 namespace fmt {
1797 
1798 // adds thousands separator to numbers
1799 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1800 class NumSep : public std::numpunct<char> {
1801 public:
1802  explicit NumSep(char sep);
1803  char do_thousands_sep() const override;
1804  std::string do_grouping() const override;
1805 
1806 private:
1807  char mSep;
1808 };
1809 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1810 
1811 // RAII to save & restore a stream's state
1812 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1813 class StreamStateRestorer {
1814 public:
1815  explicit StreamStateRestorer(std::ostream& s);
1816  ~StreamStateRestorer();
1817 
1818  // sets back all stream info that we remembered at construction
1819  void restore();
1820 
1821  // don't allow copying / moving
1822  StreamStateRestorer(StreamStateRestorer const&) = delete;
1823  StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
1824  StreamStateRestorer(StreamStateRestorer&&) = delete;
1825  StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
1826 
1827 private:
1828  std::ostream& mStream;
1829  std::locale mLocale;
1830  std::streamsize const mPrecision;
1831  std::streamsize const mWidth;
1832  std::ostream::char_type const mFill;
1833  std::ostream::fmtflags const mFmtFlags;
1834 };
1835 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1836 
1837 // Number formatter
1838 class Number {
1839 public:
1840  Number(int width, int precision, double value);
1841  Number(int width, int precision, int64_t value);
1842  std::string to_s() const;
1843 
1844 private:
1845  friend std::ostream& operator<<(std::ostream& os, Number const& n);
1846  std::ostream& write(std::ostream& os) const;
1847 
1848  int mWidth;
1849  int mPrecision;
1850  double mValue;
1851 };
1852 
1853 // helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1854 std::string to_s(uint64_t n);
1855 
1856 std::ostream& operator<<(std::ostream& os, Number const& n);
1857 
1858 class MarkDownColumn {
1859 public:
1860  MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val);
1861  std::string title() const;
1862  std::string separator() const;
1863  std::string invalid() const;
1864  std::string value() const;
1865 
1866 private:
1867  int mWidth;
1868  int mPrecision;
1869  std::string mTitle;
1870  std::string mSuffix;
1871  double mValue;
1872 };
1873 
1874 // Formats any text as markdown code, escaping backticks.
1875 class MarkDownCode {
1876 public:
1877  explicit MarkDownCode(std::string const& what);
1878 
1879 private:
1880  friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1881  std::ostream& write(std::ostream& os) const;
1882 
1883  std::string mWhat{};
1884 };
1885 
1886 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1887 
1888 } // namespace fmt
1889 } // namespace detail
1890 } // namespace nanobench
1891 } // namespace ankerl
1892 
1893 // implementation /////////////////////////////////////////////////////////////////////////////////
1894 
1895 namespace ankerl {
1896 namespace nanobench {
1897 
1898 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
1899 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1900  detail::fmt::StreamStateRestorer const restorer(out);
1901 
1902  out.precision(std::numeric_limits<double>::digits10);
1903  auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1904 
1905  for (auto const& n : nodes) {
1906  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1907  switch (n.type) {
1908  case templates::Node::Type::content:
1909  out.write(n.begin, std::distance(n.begin, n.end));
1910  break;
1911 
1912  case templates::Node::Type::inverted_section:
1913  throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1914 
1915  case templates::Node::Type::section:
1916  if (n == "result") {
1917  const size_t nbResults = results.size();
1918  for (size_t i = 0; i < nbResults; ++i) {
1919  generateResult(n.children, i, results, out);
1920  }
1921  } else if (n == "measurement") {
1922  if (results.size() != 1) {
1923  throw std::runtime_error(
1924  "render: can only use section 'measurement' here if there is a single result, but there are " +
1925  detail::fmt::to_s(results.size()));
1926  }
1927  // when we only have a single result, we can immediately go into its measurement.
1928  auto const& r = results.front();
1929  for (size_t i = 0; i < r.size(); ++i) {
1930  generateResultMeasurement(n.children, i, r, out);
1931  }
1932  } else {
1933  throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1934  }
1935  break;
1936 
1937  case templates::Node::Type::tag:
1938  if (results.size() == 1) {
1939  // result & config are both supported here
1940  generateResultTag(n, results.front(), out);
1941  } else {
1942  // This just uses the last result's config.
1943  if (!generateConfigTag(n, results.back().config(), out)) {
1944  throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1945  }
1946  }
1947  break;
1948  }
1949  }
1950 }
1951 
1952 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1953  render(mustacheTemplate.c_str(), results, out);
1954 }
1955 
1956 void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
1957  render(mustacheTemplate, bench.results(), out);
1958 }
1959 
1960 void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
1961  render(mustacheTemplate.c_str(), bench.results(), out);
1962 }
1963 
1964 namespace detail {
1965 
1966 PerformanceCounters& performanceCounters() {
1967 # if defined(__clang__)
1968 # pragma clang diagnostic push
1969 # pragma clang diagnostic ignored "-Wexit-time-destructors"
1970 # endif
1971  static PerformanceCounters pc;
1972 # if defined(__clang__)
1973 # pragma clang diagnostic pop
1974 # endif
1975  return pc;
1976 }
1977 
1978 // Windows version of doNotOptimizeAway
1979 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
1980 // see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
1981 // see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
1982 # if defined(_MSC_VER)
1983 # pragma optimize("", off)
1984 void doNotOptimizeAwaySink(void const*) {}
1985 # pragma optimize("", on)
1986 # endif
1987 
1988 template <typename T>
1989 T parseFile(std::string const& filename) {
1990  std::ifstream fin(filename); // NOLINT(misc-const-correctness)
1991  T num{};
1992  fin >> num;
1993  return num;
1994 }
1995 
1996 char const* getEnv(char const* name) {
1997 # if defined(_MSC_VER)
1998 # pragma warning(push)
1999 # pragma warning(disable : 4996) // 'getenv': This function or variable may be unsafe.
2000 # endif
2001  return std::getenv(name); // NOLINT(concurrency-mt-unsafe)
2002 # if defined(_MSC_VER)
2003 # pragma warning(pop)
2004 # endif
2005 }
2006 
2007 bool isEndlessRunning(std::string const& name) {
2008  auto const* const endless = getEnv("NANOBENCH_ENDLESS");
2009  return nullptr != endless && endless == name;
2010 }
2011 
2012 // True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
2013 bool isWarningsEnabled() {
2014  auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
2015  return nullptr == suppression || suppression == std::string("0");
2016 }
2017 
2018 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
2019  warnings.clear();
2020  recommendations.clear();
2021 
2022 # if defined(DEBUG)
2023  warnings.emplace_back("DEBUG defined");
2024  bool const recommendCheckFlags = true;
2025 # else
2026  bool const recommendCheckFlags = false;
2027 # endif
2028 
2029  bool recommendPyPerf = false;
2030 # if defined(__linux__)
2031  auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
2032  if (nprocs <= 0) {
2033  warnings.emplace_back("couldn't figure out the number of processors - no governor or turbo check possible");
2034  } else {
2035 
2036  // check frequency scaling
2037  for (long id = 0; id < nprocs; ++id) {
2038  auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
2039  auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
2040  auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
2041  auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
2042  if (minFreq != maxFreq) {
2043  auto minMHz = static_cast<double>(minFreq) / 1000.0;
2044  auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
2045  warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
2046  detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
2047  " MHz");
2048  recommendPyPerf = true;
2049  break;
2050  }
2051  }
2052 
2053  auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
2054  if ("performance" != currentGovernor) {
2055  warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
2056  recommendPyPerf = true;
2057  }
2058 
2059  if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
2060  warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
2061  recommendPyPerf = true;
2062  }
2063  }
2064 # endif
2065 
2066  if (recommendCheckFlags) {
2067  recommendations.emplace_back("Make sure you compile for Release");
2068  }
2069  if (recommendPyPerf) {
2070  recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
2071  }
2072 }
2073 
2074 void printStabilityInformationOnce(std::ostream* outStream) {
2075  static bool shouldPrint = true;
2076  if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {
2077  auto& os = *outStream;
2078  shouldPrint = false;
2079  std::vector<std::string> warnings;
2080  std::vector<std::string> recommendations;
2081  gatherStabilityInformation(warnings, recommendations);
2082  if (warnings.empty()) {
2083  return;
2084  }
2085 
2086  os << "Warning, results might be unstable:" << std::endl;
2087  for (auto const& w : warnings) {
2088  os << "* " << w << std::endl;
2089  }
2090 
2091  os << std::endl << "Recommendations" << std::endl;
2092  for (auto const& r : recommendations) {
2093  os << "* " << r << std::endl;
2094  }
2095  }
2096 }
2097 
2098 // remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
2099 uint64_t& singletonHeaderHash() noexcept {
2100  static uint64_t sHeaderHash{};
2101  return sHeaderHash;
2102 }
2103 
2104 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2105 inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2106  return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2107 }
2108 
2109 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2110 Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2111  auto bestDuration = Clock::duration::max();
2112  Clock::time_point tBegin;
2113  Clock::time_point tEnd;
2114  for (size_t i = 0; i < numEvaluations; ++i) {
2115  tBegin = Clock::now();
2116  do {
2117  tEnd = Clock::now();
2118  } while (tBegin == tEnd);
2119  bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2120  }
2121  return bestDuration;
2122 }
2123 
2124 // Calculates clock resolution once, and remembers the result
2125 Clock::duration clockResolution() noexcept {
2126  static Clock::duration const sResolution = calcClockResolution(20);
2127  return sResolution;
2128 }
2129 
2130 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2131 struct IterationLogic::Impl {
2132  enum class State { warmup, upscaling_runtime, measuring, endless };
2133 
2134  explicit Impl(Bench const& bench)
2135  : mBench(bench)
2136  , mResult(bench.config()) {
2137  printStabilityInformationOnce(mBench.output());
2138 
2139  // determine target runtime per epoch
2140  mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2141  if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2142  mTargetRuntimePerEpoch = mBench.maxEpochTime();
2143  }
2144  if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2145  mTargetRuntimePerEpoch = mBench.minEpochTime();
2146  }
2147 
2148  if (isEndlessRunning(mBench.name())) {
2149  std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
2150  mNumIters = (std::numeric_limits<uint64_t>::max)();
2151  mState = State::endless;
2152  } else if (0 != mBench.warmup()) {
2153  mNumIters = mBench.warmup();
2154  mState = State::warmup;
2155  } else if (0 != mBench.epochIterations()) {
2156  // exact number of iterations
2157  mNumIters = mBench.epochIterations();
2158  mState = State::measuring;
2159  } else {
2160  mNumIters = mBench.minEpochIterations();
2161  mState = State::upscaling_runtime;
2162  }
2163  }
2164 
2165  // directly calculates new iters based on elapsed & iters, and adds up to 20% noise (10% on average). Makes sure we don't underflow.
2166  ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2167  auto doubleElapsed = d(elapsed);
2168  auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2169  auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2170 
2171  auto doubleMinEpochIters = d(mBench.minEpochIterations());
2172  if (doubleNewIters < doubleMinEpochIters) {
2173  doubleNewIters = doubleMinEpochIters;
2174  }
2175  doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2176 
2177  // +0.5 for correct rounding when casting
2178  // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2179  return static_cast<uint64_t>(doubleNewIters + 0.5);
2180  }
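// Worked example for calcBestNumIters(): if an epoch of 1000 iterations took 2 ms and the
// target runtime per epoch is 10 ms, the scaled estimate is 5000 iterations, and the added
// noise places the next epoch somewhere in roughly [5000, 6000] iterations.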
2181 
2182  ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
2183  if (elapsed * 10 < mTargetRuntimePerEpoch) {
2184  // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
2185  if (mNumIters * 10 < mNumIters) {
2186  // overflow :-(
2187  showResult("iterations overflow. Maybe your code got optimized away?");
2188  mNumIters = 0;
2189  return;
2190  }
2191  mNumIters *= 10;
2192  } else {
2193  mNumIters = calcBestNumIters(elapsed, mNumIters);
2194  }
2195  }
2196 
2197  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2198 # if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2199  auto oldIters = mNumIters;
2200 # endif
2201 
2202  switch (mState) {
2203  case State::warmup:
2204  if (isCloseEnoughForMeasurements(elapsed)) {
2205  // if elapsed is close enough, we can skip upscaling and go right to measurements
2206  // still, we don't add the result to the measurements.
2207  mState = State::measuring;
2208  mNumIters = calcBestNumIters(elapsed, mNumIters);
2209  } else {
2210  // not close enough: switch to upscaling
2211  mState = State::upscaling_runtime;
2212  upscale(elapsed);
2213  }
2214  break;
2215 
2216  case State::upscaling_runtime:
2217  if (isCloseEnoughForMeasurements(elapsed)) {
2218  // if we are close enough, add measurement and switch to always measuring
2219  mState = State::measuring;
2220  mTotalElapsed += elapsed;
2221  mTotalNumIters += mNumIters;
2222  mResult.add(elapsed, mNumIters, pc);
2223  mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2224  } else {
2225  upscale(elapsed);
2226  }
2227  break;
2228 
2229  case State::measuring:
 2230  // just add the measurement, no questions asked, even when the runtime is low. Skipping
 2231  // such low-runtime epochs here would bias the result.
2232  mTotalElapsed += elapsed;
2233  mTotalNumIters += mNumIters;
2234  mResult.add(elapsed, mNumIters, pc);
2235  if (0 != mBench.epochIterations()) {
2236  mNumIters = mBench.epochIterations();
2237  } else {
2238  mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2239  }
2240  break;
2241 
2242  case State::endless:
2243  mNumIters = (std::numeric_limits<uint64_t>::max)();
2244  break;
2245  }
2246 
2247  if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2248  // we got all the results that we need, finish it
2249  showResult("");
2250  mNumIters = 0;
2251  }
2252 
2253  ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
2254  << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
2255  << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
2256  << ", mState=" << static_cast<int>(mState));
2257  }
2258 
2259  // NOLINTNEXTLINE(readability-function-cognitive-complexity)
2260  void showResult(std::string const& errorMessage) const {
2261  ANKERL_NANOBENCH_LOG(errorMessage);
2262 
2263  if (mBench.output() != nullptr) {
2264  // prepare column data ///////
2265  std::vector<fmt::MarkDownColumn> columns;
2266 
2267  auto rMedian = mResult.median(Result::Measure::elapsed);
2268 
2269  if (mBench.relative()) {
2270  double d = 100.0;
2271  if (!mBench.results().empty()) {
2272  d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2273  }
2274  columns.emplace_back(11, 1, "relative", "%", d);
2275  }
2276 
2277  if (mBench.complexityN() > 0) {
2278  columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2279  }
2280 
2281  columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
2282  rMedian / (mBench.timeUnit().count() * mBench.batch()));
2283  columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2284 
2285  double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2286  columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2287 
2288  double rInsMedian = -1.0;
2289  if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
2290  rInsMedian = mResult.median(Result::Measure::instructions);
2291  columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2292  }
2293 
2294  double rCycMedian = -1.0;
2295  if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
2296  rCycMedian = mResult.median(Result::Measure::cpucycles);
2297  columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2298  }
2299  if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2300  columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2301  }
2302  if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
2303  double const rBraMedian = mResult.median(Result::Measure::branchinstructions);
2304  columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2305  if (mResult.has(Result::Measure::branchmisses)) {
2306  double p = 0.0;
2307  if (rBraMedian >= 1e-9) {
2308  p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2309  }
2310  columns.emplace_back(10, 1, "miss%", "%", p);
2311  }
2312  }
2313 
2314  columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
2315 
2316  // write everything
2317  auto& os = *mBench.output();
2318 
2319  // combine all elements that are relevant for printing the header
2320  uint64_t hash = 0;
2321  hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2322  hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2323  hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2324  hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2325  hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2326  hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2327 
2328  if (hash != singletonHeaderHash()) {
2329  singletonHeaderHash() = hash;
2330 
2331  // no result yet, print header
2332  os << std::endl;
2333  for (auto const& col : columns) {
2334  os << col.title();
2335  }
2336  os << "| " << mBench.title() << std::endl;
2337 
2338  for (auto const& col : columns) {
2339  os << col.separator();
2340  }
2341  os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2342  }
2343 
2344  if (!errorMessage.empty()) {
2345  for (auto const& col : columns) {
2346  os << col.invalid();
2347  }
2348  os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2349  } else {
2350  for (auto const& col : columns) {
2351  os << col.value();
2352  }
2353  os << "| ";
2354  auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2355  if (showUnstable) {
2356  os << ":wavy_dash: ";
2357  }
2358  os << fmt::MarkDownCode(mBench.name());
2359  if (showUnstable) {
2360  auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2361  // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2362  auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2363 
2364  os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2365  << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2366  }
2367  os << std::endl;
2368  }
2369  }
2370  }
2371 
2372  ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
2373  return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2374  }
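// Equivalent to elapsed >= (2/3) * mTargetRuntimePerEpoch, written with multiplications so the
// comparison stays in exact integer nanoseconds.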
2375 
2376  uint64_t mNumIters = 1; // NOLINT(misc-non-private-member-variables-in-classes)
2377  Bench const& mBench; // NOLINT(misc-non-private-member-variables-in-classes)
2378  std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)
2379  Result mResult; // NOLINT(misc-non-private-member-variables-in-classes)
2380  Rng mRng{123}; // NOLINT(misc-non-private-member-variables-in-classes)
2381  std::chrono::nanoseconds mTotalElapsed{}; // NOLINT(misc-non-private-member-variables-in-classes)
2382  uint64_t mTotalNumIters = 0; // NOLINT(misc-non-private-member-variables-in-classes)
2383  State mState = State::upscaling_runtime; // NOLINT(misc-non-private-member-variables-in-classes)
2384 };
2385 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2386 
2387 IterationLogic::IterationLogic(Bench const& bench)
2388  : mPimpl(new Impl(bench)) {}
2389 
2390 IterationLogic::~IterationLogic() {
2391  delete mPimpl;
2392 }
2393 
2394 uint64_t IterationLogic::numIters() const noexcept {
2395  ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
2396  return mPimpl->mNumIters;
2397 }
2398 
2399 void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2400  mPimpl->add(elapsed, pc);
2401 }
2402 
2403 void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2404  results.emplace_back(std::move(mPimpl->mResult));
2405 }
2406 
2407 # if ANKERL_NANOBENCH(PERF_COUNTERS)
2408 
2409 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2410 class LinuxPerformanceCounters {
2411 public:
2412  struct Target {
2413  Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2414  : targetValue(targetValue_)
2415  , correctMeasuringOverhead(correctMeasuringOverhead_)
2416  , correctLoopOverhead(correctLoopOverhead_) {}
2417 
2418  uint64_t* targetValue{}; // NOLINT(misc-non-private-member-variables-in-classes)
2419  bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
2420  bool correctLoopOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
2421  };
2422 
2423  LinuxPerformanceCounters() = default;
2424  LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;
2425  LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;
2426  LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;
2427  LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;
2428  ~LinuxPerformanceCounters();
2429 
2430  // quick operation
2431  inline void start() {}
2432 
2433  inline void stop() {}
2434 
2435  bool monitor(perf_sw_ids swId, Target target);
2436  bool monitor(perf_hw_id hwId, Target target);
2437 
2438  bool hasError() const noexcept {
2439  return mHasError;
2440  }
2441 
 2442  // Just reading the data is faster than enabling & disabling the counters;
 2443  // we subtract the measurement overhead ourselves.
2444  inline void beginMeasure() {
2445  if (mHasError) {
2446  return;
2447  }
2448 
2449  // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2450  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2451  if (mHasError) {
2452  return;
2453  }
2454 
2455  // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2456  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2457  }
2458 
2459  inline void endMeasure() {
2460  if (mHasError) {
2461  return;
2462  }
2463 
2464  // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2465  mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2466  if (mHasError) {
2467  return;
2468  }
2469 
2470  auto const numBytes = sizeof(uint64_t) * mCounters.size();
2471  auto ret = read(mFd, mCounters.data(), numBytes);
2472  mHasError = ret != static_cast<ssize_t>(numBytes);
2473  }
2474 
2475  void updateResults(uint64_t numIters);
2476 
2477  // rounded integer division
2478  template <typename T>
2479  static inline T divRounded(T a, T divisor) {
2480  return (a + divisor / 2) / divisor;
2481  }
2482 
2483  ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2484  static inline uint32_t mix(uint32_t x) noexcept {
2485  x ^= x << 13U;
2486  x ^= x >> 17U;
2487  x ^= x << 5U;
2488  return x;
2489  }
2490 
2491  template <typename Op>
2492  ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2493  void calibrate(Op&& op) {
2494  // clear current calibration data,
2495  for (auto& v : mCalibratedOverhead) {
2496  v = UINT64_C(0);
2497  }
2498 
2499  // create new calibration data
2500  auto newCalibration = mCalibratedOverhead;
2501  for (auto& v : newCalibration) {
2502  v = (std::numeric_limits<uint64_t>::max)();
2503  }
2504  for (size_t iter = 0; iter < 100; ++iter) {
2505  beginMeasure();
2506  op();
2507  endMeasure();
2508  if (mHasError) {
2509  return;
2510  }
2511 
2512  for (size_t i = 0; i < newCalibration.size(); ++i) {
2513  auto diff = mCounters[i];
2514  if (newCalibration[i] > diff) {
2515  newCalibration[i] = diff;
2516  }
2517  }
2518  }
2519 
2520  mCalibratedOverhead = std::move(newCalibration);
2521 
2522  {
2523  // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
 2524  // Marsaglia's xorshift: mov, sal/shr, xor - repeated three times.
2525  // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
2526  // see https://godbolt.org/z/49RVQ5
2527  uint64_t const numIters = 100000U + (std::random_device{}() & 3U);
2528  uint64_t n = numIters;
2529  uint32_t x = 1234567;
2530 
2531  beginMeasure();
2532  while (n-- > 0) {
2533  x = mix(x);
2534  }
 2535  endMeasure();
 2536  detail::doNotOptimizeAway(x); // keep x observable so the calibration loop is not elided
 2537  auto measure1 = mCounters;
2538 
2539  n = numIters;
2540  beginMeasure();
2541  while (n-- > 0) {
2542  // we now run *twice* so we can easily calculate the overhead
2543  x = mix(x);
2544  x = mix(x);
2545  }
 2546  endMeasure();
 2547  detail::doNotOptimizeAway(x);
 2548  auto measure2 = mCounters;
2549 
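// Per iteration the first run measures roughly (loop overhead + 1 mix) and the second run
// (loop overhead + 2 mixes), so 2*m1 - m2 isolates the loop overhead alone; the clamping
// below guards against noisy counters driving the subtraction negative.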
2550  for (size_t i = 0; i < mCounters.size(); ++i) {
 2551  // factor 2 because the second loop calls mix() twice per iteration
2552  auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2553  auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2554  auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2555 
2556  mLoopOverhead[i] = divRounded(overhead, numIters);
2557  }
2558  }
2559  }
2560 
2561 private:
2562  bool monitor(uint32_t type, uint64_t eventid, Target target);
2563 
2564  std::map<uint64_t, Target> mIdToTarget{};
2565 
2566  // start with minimum size of 3 for read_format
2567  std::vector<uint64_t> mCounters{3};
2568  std::vector<uint64_t> mCalibratedOverhead{3};
2569  std::vector<uint64_t> mLoopOverhead{3};
2570 
2571  uint64_t mTimeEnabledNanos = 0;
2572  uint64_t mTimeRunningNanos = 0;
2573  int mFd = -1;
2574  bool mHasError = false;
2575 };
2576 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2577 
2578 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2579  if (-1 != mFd) {
2580  close(mFd);
2581  }
2582 }
2583 
2584 bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2585  return monitor(PERF_TYPE_SOFTWARE, swId, target);
2586 }
2587 
2588 bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2589  return monitor(PERF_TYPE_HARDWARE, hwId, target);
2590 }
2591 
2592 // overflow is ok, it's checked
2593 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2594 void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2595  // clear old data
2596  for (auto& id_value : mIdToTarget) {
2597  *id_value.second.targetValue = UINT64_C(0);
2598  }
2599 
2600  if (mHasError) {
2601  return;
2602  }
2603 
2604  mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2605  mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2606 
2607  for (uint64_t i = 0; i < mCounters[0]; ++i) {
2608  auto idx = static_cast<size_t>(3 + i * 2 + 0);
2609  auto id = mCounters[idx + 1U];
2610 
2611  auto it = mIdToTarget.find(id);
2612  if (it != mIdToTarget.end()) {
2613 
2614  auto& tgt = it->second;
2615  *tgt.targetValue = mCounters[idx];
2616  if (tgt.correctMeasuringOverhead) {
2617  if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2618  *tgt.targetValue -= mCalibratedOverhead[idx];
2619  } else {
2620  *tgt.targetValue = 0U;
2621  }
2622  }
2623  if (tgt.correctLoopOverhead) {
2624  auto correctionVal = mLoopOverhead[idx] * numIters;
2625  if (*tgt.targetValue >= correctionVal) {
2626  *tgt.targetValue -= correctionVal;
2627  } else {
2628  *tgt.targetValue = 0U;
2629  }
2630  }
2631  }
2632  }
2633 }
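// With PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED |
// PERF_FORMAT_TOTAL_TIME_RUNNING, the read(2) buffer is laid out as
// { nr, time_enabled, time_running, value0, id0, value1, id1, ... }, which is why the loop
// above starts at index 3 and walks the counters in (value, id) pairs.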
2634 
2635 bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2636  *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2637  if (mHasError) {
2638  return false;
2639  }
2640 
2641  auto pea = perf_event_attr();
2642  std::memset(&pea, 0, sizeof(perf_event_attr));
2643  pea.type = type;
2644  pea.size = sizeof(perf_event_attr);
2645  pea.config = eventid;
2646  pea.disabled = 1; // start counter as disabled
2647  pea.exclude_kernel = 1;
2648  pea.exclude_hv = 1;
2649 
2650  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2651  pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2652 
2653  const int pid = 0; // the current process
2654  const int cpu = -1; // all CPUs
2655 # if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2656  const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2657 # else
2658  const unsigned long flags = 0;
2659 # endif
2660 
2661  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
2662  auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2663  if (-1 == fd) {
2664  return false;
2665  }
2666  if (-1 == mFd) {
2667  // first call: set to fd, and use this from now on
2668  mFd = fd;
2669  }
2670  uint64_t id = 0;
2671  // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2672  if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
2673  // couldn't get id
2674  return false;
2675  }
2676 
 2677  // insert into the map; references into a std::map stay valid across later insertions.
2678  mIdToTarget.emplace(id, target);
2679 
2680  // prepare readformat with the correct size (after the insert)
2681  auto size = 3 + 2 * mIdToTarget.size();
2682  mCounters.resize(size);
2683  mCalibratedOverhead.resize(size);
2684  mLoopOverhead.resize(size);
2685 
2686  return true;
2687 }
2688 
2689 PerformanceCounters::PerformanceCounters()
2690  : mPc(new LinuxPerformanceCounters())
2691  , mVal()
2692  , mHas() {
2693 
2694  mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
2695  mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2696  mHas.contextSwitches =
2697  mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
2698  mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
2699  mHas.branchInstructions =
2700  mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
2701  mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
2702  // mHas.branchMisses = false;
2703 
2704  mPc->start();
2705  mPc->calibrate([] {
2706  auto before = ankerl::nanobench::Clock::now();
2707  auto after = ankerl::nanobench::Clock::now();
2708  (void)before;
2709  (void)after;
2710  });
2711 
2712  if (mPc->hasError()) {
2713  // something failed, don't monitor anything.
2714  mHas = PerfCountSet<bool>{};
2715  }
2716 }
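// The calibration payload above is just two Clock::now() calls, so mCalibratedOverhead ends up
// holding roughly what an empty measurement (the counter ioctls plus the clock reads) costs;
// counters created with correctMeasuringOverhead == true get this subtracted in updateResults().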
2717 
2718 PerformanceCounters::~PerformanceCounters() {
2719  // no need to check for nullptr, delete nullptr has no effect
2720  delete mPc;
2721 }
2722 
2723 void PerformanceCounters::beginMeasure() {
2724  mPc->beginMeasure();
2725 }
2726 
2727 void PerformanceCounters::endMeasure() {
2728  mPc->endMeasure();
2729 }
2730 
2731 void PerformanceCounters::updateResults(uint64_t numIters) {
2732  mPc->updateResults(numIters);
2733 }
2734 
2735 # else
2736 
2737 PerformanceCounters::PerformanceCounters() = default;
2738 PerformanceCounters::~PerformanceCounters() = default;
2739 void PerformanceCounters::beginMeasure() {}
2740 void PerformanceCounters::endMeasure() {}
2741 void PerformanceCounters::updateResults(uint64_t) {}
2742 
2743 # endif
2744 
2745 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
2746  return mVal;
2747 }
2748 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
2749  return mHas;
2750 }
2751 
2752 // formatting utilities
2753 namespace fmt {
2754 
2755 // adds thousands separator to numbers
2756 NumSep::NumSep(char sep)
2757  : mSep(sep) {}
2758 
2759 char NumSep::do_thousands_sep() const {
2760  return mSep;
2761 }
2762 
2763 std::string NumSep::do_grouping() const {
2764  return "\003";
2765 }
2766 
2767 // RAII to save & restore a stream's state
2768 StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2769  : mStream(s)
2770  , mLocale(s.getloc())
2771  , mPrecision(s.precision())
2772  , mWidth(s.width())
2773  , mFill(s.fill())
2774  , mFmtFlags(s.flags()) {}
2775 
2776 StreamStateRestorer::~StreamStateRestorer() {
2777  restore();
2778 }
2779 
2780 // sets back all stream info that we remembered at construction
2781 void StreamStateRestorer::restore() {
2782  mStream.imbue(mLocale);
2783  mStream.precision(mPrecision);
2784  mStream.width(mWidth);
2785  mStream.fill(mFill);
2786  mStream.flags(mFmtFlags);
2787 }
2788 
2789 Number::Number(int width, int precision, int64_t value)
2790  : mWidth(width)
2791  , mPrecision(precision)
2792  , mValue(static_cast<double>(value)) {}
2793 
2794 Number::Number(int width, int precision, double value)
2795  : mWidth(width)
2796  , mPrecision(precision)
2797  , mValue(value) {}
2798 
2799 std::ostream& Number::write(std::ostream& os) const {
2800  StreamStateRestorer const restorer(os);
2801  os.imbue(std::locale(os.getloc(), new NumSep(',')));
2802  os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2803  return os;
2804 }
2805 
2806 std::string Number::to_s() const {
2807  std::stringstream ss;
2808  write(ss);
2809  return ss.str();
2810 }
2811 
2812 std::string to_s(uint64_t n) {
2813  std::string str;
2814  do {
2815  str += static_cast<char>('0' + static_cast<char>(n % 10));
2816  n /= 10;
2817  } while (n != 0);
2818  std::reverse(str.begin(), str.end());
2819  return str;
2820 }
2821 
2822 std::ostream& operator<<(std::ostream& os, Number const& n) {
2823  return n.write(os);
2824 }
2825 
2826 MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val)
2827  : mWidth(w)
2828  , mPrecision(prec)
2829  , mTitle(std::move(tit))
2830  , mSuffix(std::move(suff))
2831  , mValue(val) {}
2832 
2833 std::string MarkDownColumn::title() const {
2834  std::stringstream ss;
2835  ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
2836  return ss.str();
2837 }
2838 
2839 std::string MarkDownColumn::separator() const {
2840  std::string sep(static_cast<size_t>(mWidth), '-');
2841  sep.front() = '|';
2842  sep.back() = ':';
2843  return sep;
2844 }
2845 
2846 std::string MarkDownColumn::invalid() const {
2847  std::string sep(static_cast<size_t>(mWidth), ' ');
2848  sep.front() = '|';
2849  sep[sep.size() - 2] = '-';
2850  return sep;
2851 }
2852 
2853 std::string MarkDownColumn::value() const {
2854  std::stringstream ss;
2855  auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
2856  ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
2857  return ss.str();
2858 }
2859 
2860 // Formats any text as markdown code, escaping backticks.
2861 MarkDownCode::MarkDownCode(std::string const& what) {
2862  mWhat.reserve(what.size() + 2);
2863  mWhat.push_back('`');
2864  for (char const c : what) {
2865  mWhat.push_back(c);
2866  if ('`' == c) {
2867  mWhat.push_back('`');
2868  }
2869  }
2870  mWhat.push_back('`');
2871 }
2872 
2873 std::ostream& MarkDownCode::write(std::ostream& os) const {
2874  return os << mWhat;
2875 }
2876 
2877 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
2878  return mdCode.write(os);
2879 }
2880 } // namespace fmt
2881 } // namespace detail
2882 
2883 // provide implementation here so it's only generated once
2884 Config::Config() = default;
2885 Config::~Config() = default;
2886 Config& Config::operator=(Config const&) = default;
2887 Config& Config::operator=(Config&&) noexcept = default;
2888 Config::Config(Config const&) = default;
2889 Config::Config(Config&&) noexcept = default;
2890 
2891 // provide implementation here so it's only generated once
2892 Result::~Result() = default;
2893 Result& Result::operator=(Result const&) = default;
2894 Result& Result::operator=(Result&&) noexcept = default;
2895 Result::Result(Result const&) = default;
2896 Result::Result(Result&&) noexcept = default;
2897 
2898 namespace detail {
2899 template <typename T>
2900 inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
2901  return static_cast<typename std::underlying_type<T>::type>(val);
2902 }
2903 } // namespace detail
2904 
2905 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
2906 Result::Result(Config benchmarkConfig)
2907  : mConfig(std::move(benchmarkConfig))
2908  , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2909 
2910 void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
2911  using detail::d;
2912  using detail::u;
2913 
2914  double const dIters = d(iters);
2915  mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2916 
2917  mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2918  if (pc.has().pageFaults) {
2919  mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2920  }
2921  if (pc.has().cpuCycles) {
2922  mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2923  }
2924  if (pc.has().contextSwitches) {
2925  mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2926  }
2927  if (pc.has().instructions) {
2928  mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2929  }
2930  if (pc.has().branchInstructions) {
2931  double branchInstructions = 0.0;
2932  // correcting branches: remove branch introduced by the while (...) loop for each iteration.
2933  if (pc.val().branchInstructions > iters + 1U) {
2934  branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2935  }
2936  mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2937 
2938  if (pc.has().branchMisses) {
2939  // correcting branch misses
2940  double branchMisses = d(pc.val().branchMisses);
2941  if (branchMisses > branchInstructions) {
 2942  // can't have more branch misses than branch instructions...
2943  branchMisses = branchInstructions;
2944  }
2945 
2946  // assuming at least one missed branch for the loop
2947  branchMisses -= 1.0;
2948  if (branchMisses < 1.0) {
2949  branchMisses = 1.0;
2950  }
2951  mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2952  }
2953  }
2954 }
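// Note: every measure is stored per iteration (divided by dIters), so medians and averages can
// be compared across epochs even when the epochs ran different iteration counts.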
2955 
2956 Config const& Result::config() const noexcept {
2957  return mConfig;
2958 }
2959 
2960 inline double calcMedian(std::vector<double>& data) {
2961  if (data.empty()) {
2962  return 0.0;
2963  }
2964  std::sort(data.begin(), data.end());
2965 
2966  auto midIdx = data.size() / 2U;
2967  if (1U == (data.size() & 1U)) {
2968  return data[midIdx];
2969  }
2970  return (data[midIdx - 1U] + data[midIdx]) / 2U;
2971 }
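// e.g. {3, 1, 2} sorts to {1, 2, 3} -> median 2; an even count such as {1, 2, 3, 4} averages
// the two middle elements -> 2.5.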
2972 
2973 double Result::median(Measure m) const {
2974  // create a copy so we can sort
2975  auto data = mNameToMeasurements[detail::u(m)];
2976  return calcMedian(data);
2977 }
2978 
2979 double Result::average(Measure m) const {
2980  using detail::d;
2981  auto const& data = mNameToMeasurements[detail::u(m)];
2982  if (data.empty()) {
2983  return 0.0;
2984  }
2985 
 2986  // arithmetic mean over all recorded epochs
2987  return sum(m) / d(data.size());
2988 }
2989 
2990 double Result::medianAbsolutePercentError(Measure m) const {
2991  // create copy
2992  auto data = mNameToMeasurements[detail::u(m)];
2993 
2994  // calculates MdAPE which is the median of percentage error
2995  // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
2996  auto med = calcMedian(data);
2997 
 2998  // transform the data to absolute percentage error
2999  for (auto& x : data) {
3000  x = (x - med) / x;
3001  if (x < 0) {
3002  x = -x;
3003  }
3004  }
3005  return calcMedian(data);
3006 }
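// In formula form: MdAPE = median_i( |x_i - median(x)| / x_i ), i.e. the median of the
// per-sample absolute percentage deviations from the median.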
3007 
3008 double Result::sum(Measure m) const noexcept {
3009  auto const& data = mNameToMeasurements[detail::u(m)];
3010  return std::accumulate(data.begin(), data.end(), 0.0);
3011 }
3012 
3013 double Result::sumProduct(Measure m1, Measure m2) const noexcept {
3014  auto const& data1 = mNameToMeasurements[detail::u(m1)];
3015  auto const& data2 = mNameToMeasurements[detail::u(m2)];
3016 
3017  if (data1.size() != data2.size()) {
3018  return 0.0;
3019  }
3020 
3021  double result = 0.0;
3022  for (size_t i = 0, s = data1.size(); i != s; ++i) {
3023  result += data1[i] * data2[i];
3024  }
3025  return result;
3026 }
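// Used e.g. as sumProduct(Measure::iterations, Measure::elapsed) for the "total" column: since
// elapsed is stored per iteration, the dot product recovers the total measured wall time.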
3027 
3028 bool Result::has(Measure m) const noexcept {
3029  return !mNameToMeasurements[detail::u(m)].empty();
3030 }
3031 
3032 double Result::get(size_t idx, Measure m) const {
3033  auto const& data = mNameToMeasurements[detail::u(m)];
3034  return data.at(idx);
3035 }
3036 
3037 bool Result::empty() const noexcept {
3038  return 0U == size();
3039 }
3040 
3041 size_t Result::size() const noexcept {
3042  auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
3043  return data.size();
3044 }
3045 
3046 double Result::minimum(Measure m) const noexcept {
3047  auto const& data = mNameToMeasurements[detail::u(m)];
3048  if (data.empty()) {
3049  return 0.0;
3050  }
3051 
 3052  // here it's safe to assume that at least one element is there
3053  return *std::min_element(data.begin(), data.end());
3054 }
3055 
3056 double Result::maximum(Measure m) const noexcept {
3057  auto const& data = mNameToMeasurements[detail::u(m)];
3058  if (data.empty()) {
3059  return 0.0;
3060  }
3061 
 3062  // here it's safe to assume that at least one element is there
3063  return *std::max_element(data.begin(), data.end());
3064 }
3065 
3066 std::string const& Result::context(char const* variableName) const {
3067  return mConfig.mContext.at(variableName);
3068 }
3069 
3070 std::string const& Result::context(std::string const& variableName) const {
3071  return mConfig.mContext.at(variableName);
3072 }
3073 
3074 Result::Measure Result::fromString(std::string const& str) {
3075  if (str == "elapsed") {
3076  return Measure::elapsed;
3077  }
3078  if (str == "iterations") {
3079  return Measure::iterations;
3080  }
3081  if (str == "pagefaults") {
3082  return Measure::pagefaults;
3083  }
3084  if (str == "cpucycles") {
3085  return Measure::cpucycles;
3086  }
3087  if (str == "contextswitches") {
3088  return Measure::contextswitches;
3089  }
3090  if (str == "instructions") {
3091  return Measure::instructions;
3092  }
3093  if (str == "branchinstructions") {
3094  return Measure::branchinstructions;
3095  }
3096  if (str == "branchmisses") {
3097  return Measure::branchmisses;
3098  }
3099  // not found, return _size
3100  return Measure::_size;
3101 }
3102 
3103 // Configuration of a microbenchmark.
3104 Bench::Bench() {
3105  mConfig.mOut = &std::cout;
3106 }
3107 
3108 Bench::Bench(Bench&&) noexcept = default;
3109 Bench& Bench::operator=(Bench&&) noexcept = default;
3110 Bench::Bench(Bench const&) = default;
3111 Bench& Bench::operator=(Bench const&) = default;
3112 Bench::~Bench() noexcept = default;
3113 
3114 double Bench::batch() const noexcept {
3115  return mConfig.mBatch;
3116 }
3117 
3118 double Bench::complexityN() const noexcept {
3119  return mConfig.mComplexityN;
3120 }
3121 
3122 // Set a baseline to compare against. 100% means it is exactly as fast as the baseline, >100% means it is faster than the
3123 // baseline, <100% means it is slower than the baseline.
3124 Bench& Bench::relative(bool isRelativeEnabled) noexcept {
3125  mConfig.mIsRelative = isRelativeEnabled;
3126  return *this;
3127 }
3128 bool Bench::relative() const noexcept {
3129  return mConfig.mIsRelative;
3130 }
3131 
3132 Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
3133  mConfig.mShowPerformanceCounters = showPerformanceCounters;
3134  return *this;
3135 }
3136 bool Bench::performanceCounters() const noexcept {
3137  return mConfig.mShowPerformanceCounters;
3138 }
3139 
3140 // Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
3141 // If u differs from currently set unit, the stored results will be cleared.
3142 // Use singular (byte, not bytes).
3143 Bench& Bench::unit(char const* u) {
3144  if (u != mConfig.mUnit) {
3145  mResults.clear();
3146  }
3147  mConfig.mUnit = u;
3148  return *this;
3149 }
3150 
3151 Bench& Bench::unit(std::string const& u) {
3152  return unit(u.c_str());
3153 }
3154 
3155 std::string const& Bench::unit() const noexcept {
3156  return mConfig.mUnit;
3157 }
3158 
3159 Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
3160  mConfig.mTimeUnit = tu;
3161  mConfig.mTimeUnitName = tuName;
3162  return *this;
3163 }
3164 
3165 std::string const& Bench::timeUnitName() const noexcept {
3166  return mConfig.mTimeUnitName;
3167 }
3168 
3169 std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
3170  return mConfig.mTimeUnit;
3171 }
3172 
3173 // If benchmarkTitle differs from currently set title, the stored results will be cleared.
3174 Bench& Bench::title(const char* benchmarkTitle) {
3175  if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3176  mResults.clear();
3177  }
3178  mConfig.mBenchmarkTitle = benchmarkTitle;
3179  return *this;
3180 }
3181 Bench& Bench::title(std::string const& benchmarkTitle) {
3182  if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3183  mResults.clear();
3184  }
3185  mConfig.mBenchmarkTitle = benchmarkTitle;
3186  return *this;
3187 }
3188 
3189 std::string const& Bench::title() const noexcept {
3190  return mConfig.mBenchmarkTitle;
3191 }
3192 
3193 Bench& Bench::name(const char* benchmarkName) {
3194  mConfig.mBenchmarkName = benchmarkName;
3195  return *this;
3196 }
3197 
3198 Bench& Bench::name(std::string const& benchmarkName) {
3199  mConfig.mBenchmarkName = benchmarkName;
3200  return *this;
3201 }
3202 
3203 std::string const& Bench::name() const noexcept {
3204  return mConfig.mBenchmarkName;
3205 }
3206 
3207 Bench& Bench::context(char const* variableName, char const* variableValue) {
3208  mConfig.mContext[variableName] = variableValue;
3209  return *this;
3210 }
3211 
3212 Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {
3213  mConfig.mContext[variableName] = variableValue;
3214  return *this;
3215 }
3216 
3217 Bench& Bench::clearContext() {
3218  mConfig.mContext.clear();
3219  return *this;
3220 }
3221 
3222 // Number of epochs to evaluate. The reported result is the median over all epochs.
3223 Bench& Bench::epochs(size_t numEpochs) noexcept {
3224  mConfig.mNumEpochs = numEpochs;
3225  return *this;
3226 }
3227 size_t Bench::epochs() const noexcept {
3228  return mConfig.mNumEpochs;
3229 }
3230 
3231 // The desired evaluation time per epoch is a multiple of the clock resolution; the default is 1000 times the measured clock resolution.
3232 Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
3233  mConfig.mClockResolutionMultiple = multiple;
3234  return *this;
3235 }
3236 size_t Bench::clockResolutionMultiple() const noexcept {
3237  return mConfig.mClockResolutionMultiple;
3238 }
3239 
3240 // Sets the maximum time each epoch should take. Default is 100ms.
3241 Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3242  mConfig.mMaxEpochTime = t;
3243  return *this;
3244 }
3245 std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3246  return mConfig.mMaxEpochTime;
3247 }
3248 
3249 // Sets the minimum time each epoch should take.
3250 Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3251  mConfig.mMinEpochTime = t;
3252  return *this;
3253 }
3254 std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3255  return mConfig.mMinEpochTime;
3256 }
3257 
3258 Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3259  mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3260  return *this;
3261 }
3262 uint64_t Bench::minEpochIterations() const noexcept {
3263  return mConfig.mMinEpochIterations;
3264 }
3265 
3266 Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3267  mConfig.mEpochIterations = numIters;
3268  return *this;
3269 }
3270 uint64_t Bench::epochIterations() const noexcept {
3271  return mConfig.mEpochIterations;
3272 }
3273 
3274 Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3275  mConfig.mWarmup = numWarmupIters;
3276  return *this;
3277 }
3278 uint64_t Bench::warmup() const noexcept {
3279  return mConfig.mWarmup;
3280 }
3281 
3282 Bench& Bench::config(Config const& benchmarkConfig) {
3283  mConfig = benchmarkConfig;
3284  return *this;
3285 }
3286 Config const& Bench::config() const noexcept {
3287  return mConfig;
3288 }
3289 
3290 Bench& Bench::output(std::ostream* outstream) noexcept {
3291  mConfig.mOut = outstream;
3292  return *this;
3293 }
3294 
3295 ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
3296  return mConfig.mOut;
3297 }
3298 
3299 std::vector<Result> const& Bench::results() const noexcept {
3300  return mResults;
3301 }
3302 
3303 Bench& Bench::render(char const* templateContent, std::ostream& os) {
3304  ::ankerl::nanobench::render(templateContent, *this, os);
3305  return *this;
3306 }
3307 
3308 Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
3309  ::ankerl::nanobench::render(templateContent, *this, os);
3310  return *this;
3311 }
3312 
3313 std::vector<BigO> Bench::complexityBigO() const {
3314  std::vector<BigO> bigOs;
3315  auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3316  bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3317  return 1.0;
3318  });
3319  bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3320  return n;
3321  });
3322  bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3323  return std::log2(n);
3324  });
3325  bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3326  return n * std::log2(n);
3327  });
3328  bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3329  return n * n;
3330  });
3331  bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3332  return n * n * n;
3333  });
3334  std::sort(bigOs.begin(), bigOs.end());
3335  return bigOs;
3336 }
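// Rough usage sketch (illustrative only, not part of this header; workOn() stands in for the
// code under test): run the same benchmark for several problem sizes with complexityN(n) set,
// then ask for the best-fitting complexity; after std::sort, the first entry has the lowest
// normalized RMS error.
//
//   ankerl::nanobench::Bench bench;
//   for (size_t n = 16; n <= 4096; n *= 2) {
//       bench.complexityN(n).run("my algorithm", [&] { workOn(n); });
//   }
//   std::cout << bench.complexityBigO().front() << '\n';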
3337 
3338 Rng::Rng()
3339  : mX(0)
3340  , mY(0) {
3341  std::random_device rd;
3342  std::uniform_int_distribution<uint64_t> dist;
3343  do {
3344  mX = dist(rd);
3345  mY = dist(rd);
3346  } while (mX == 0 && mY == 0);
3347 }
3348 
3349 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3350 uint64_t splitMix64(uint64_t& state) noexcept {
3351  uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3352  z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3353  z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3354  return z ^ (z >> 31U);
3355 }
3356 
3357 // Seeded as described in the Romu paper (update April 2020)
3358 Rng::Rng(uint64_t seed) noexcept
3359  : mX(splitMix64(seed))
3360  , mY(splitMix64(seed)) {
3361  for (size_t i = 0; i < 10; ++i) {
3362  operator()();
3363  }
3364 }
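// splitMix64 advances the by-value seed twice, producing two well-mixed, distinct state words;
// the ten discarded draws further decorrelate the first outputs from the raw seed.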
3365 
3366 // only internally used to copy the RNG.
3367 Rng::Rng(uint64_t x, uint64_t y) noexcept
3368  : mX(x)
3369  , mY(y) {}
3370 
3371 Rng Rng::copy() const noexcept {
3372  return Rng{mX, mY};
3373 }
3374 
3375 Rng::Rng(std::vector<uint64_t> const& data)
3376  : mX(0)
3377  , mY(0) {
3378  if (data.size() != 2) {
3379  throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3380  detail::fmt::to_s(data.size()));
3381  }
3382  mX = data[0];
3383  mY = data[1];
3384 }
3385 
3386 std::vector<uint64_t> Rng::state() const {
3387  std::vector<uint64_t> data(2);
3388  data[0] = mX;
3389  data[1] = mY;
3390  return data;
3391 }
3392 
3393 BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3394  BigO::RangeMeasure rangeMeasure;
3395  for (auto const& result : results) {
3396  if (result.config().mComplexityN > 0.0) {
3397  rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3398  }
3399  }
3400  return rangeMeasure;
3401 }
3402 
3403 BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)
3404  : mName(std::move(bigOName)) {
3405 
3406  // estimate the constant factor
3407  double sumRangeMeasure = 0.0;
3408  double sumRangeRange = 0.0;
3409 
3410  for (const auto& rm : rangeMeasure) {
3411  sumRangeMeasure += rm.first * rm.second;
3412  sumRangeRange += rm.first * rm.first;
3413  }
3414  mConstant = sumRangeMeasure / sumRangeRange;
3415 
3416  // calculate root mean square
3417  double err = 0.0;
3418  double sumMeasure = 0.0;
3419  for (const auto& rm : rangeMeasure) {
3420  auto diff = mConstant * rm.first - rm.second;
3421  err += diff * diff;
3422 
3423  sumMeasure += rm.second;
3424  }
3425 
3426  auto n = static_cast<double>(rangeMeasure.size());
3427  auto mean = sumMeasure / n;
3428  mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3429 }
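// mConstant is the least-squares fit of measurement ~= c * mappedN: minimizing
// sum_i (c * n_i - t_i)^2 over c gives c = sum(n_i * t_i) / sum(n_i * n_i). The error metric is
// the root-mean-square residual divided by the mean measurement, making it unitless so that
// different complexity candidates can be ranked against each other.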
3430 
3431 BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
3432  : BigO(std::string(bigOName), rangeMeasure) {}
3433 
3434 std::string const& BigO::name() const noexcept {
3435  return mName;
3436 }
3437 
3438 double BigO::constant() const noexcept {
3439  return mConstant;
3440 }
3441 
3442 double BigO::normalizedRootMeanSquare() const noexcept {
3443  return mNormalizedRootMeanSquare;
3444 }
3445 
3446 bool BigO::operator<(BigO const& other) const noexcept {
3447  return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3448 }
3449 
3450 std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
3451  return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
3452 }
3453 
3454 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
3455  detail::fmt::StreamStateRestorer const restorer(os);
3456  os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
3457  for (auto const& bigO : bigOs) {
3458  os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
3459  os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
3460  os << "| " << bigO.name();
3461  os << std::endl;
3462  }
3463  return os;
3464 }
3465 
3466 } // namespace nanobench
3467 } // namespace ankerl
3468 
3469 #endif // ANKERL_NANOBENCH_IMPLEMENT
3470 #endif // ANKERL_NANOBENCH_H_INCLUDED