HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
benchmark.h
Go to the documentation of this file.
1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio
4 
5 // clang-format off
6 
7 #pragma once
8 
9 #include <iostream>
10 #include <vector>
11 
14 #include <OpenImageIO/strutil.h>
15 #include <OpenImageIO/timer.h>
16 
17 
19 
20 /// DoNotOptimize(val) is a helper function for timing benchmarks that fools
21 /// the compiler into thinking that the location 'val' is used and will not
22 /// optimize it away. For benchmarks only, do not use in production code!
23 /// May not work on all platforms. References:
24 /// * Chandler Carruth's CppCon 2015 talk
25 /// * Folly https://github.com/facebook/folly/blob/master/folly/Benchmark.h
26 /// * Google Benchmark https://github.com/google/benchmark/blob/master/include/benchmark/benchmark_api.h
27 
28 template <class T>
29 OIIO_FORCEINLINE T const& DoNotOptimize (T const &val);
30 
31 
32 /// clobber_all_memory() is a helper function for timing benchmarks that
33 /// fools the compiler into thinking that potentially any part of memory
34 /// has been modified, and thus serves as a barrier where the optimizer
35 /// won't assume anything about the state of memory preceding it.
36 
38 
39 
40 
41 /// A call to clobber(p) fools the compiler into thinking that p (or *p, for
42 /// the pointer version) might potentially have its memory altered. The
43 /// implementation actually does nothing, but it's in another module, so the
44 /// compiler won't know this and will be conservative about any assumptions
45 /// of what's in p. This is helpful for benchmarking, to help erase any
46 /// preconceptions the optimizer has about what might be in a variable.
47 
48 void OIIO_UTIL_API clobber (void* p);
49 OIIO_FORCEINLINE void clobber (const void* p) { clobber ((void*)p); }
50 
// Reference overload: passes the address of p to the pointer overload of
// clobber(), then returns p itself by reference.
51 template<typename T>
52 OIIO_FORCEINLINE T& clobber (T& p) { clobber(&p); return p; }
53 
54 // Multi-argument clobber, added in OIIO 2.2.2
// Clobbers the first argument through the pointer overload, then hands the
// remaining arguments back to clobber(). Unlike the single-reference
// overload, this one returns void.
55 template<typename T, typename ...Ts>
56 OIIO_FORCEINLINE void clobber (T& p, Ts&... ps)
57 {
58  clobber(&p);
// sizeof...(Ts) is the number of remaining arguments; only continue when
// there are some left.
59  if (sizeof...(Ts) > 0)
60  clobber(ps...);
61 }
62 
63 
64 
65 
66 /// Benchmarker is a class to assist with "micro-benchmarking".
67 /// The goal is to discern how long it takes to run a snippet of code
68 /// (function, lambda, etc). The code will be run in some number of trials,
69 /// each consisting of many iterations, yielding statistics about the run
70 /// time of the code.
71 ///
72 /// The number of trials is user-selectable, with a reasonable default of 10
73 /// trials. The number of iterations per trial may be set explicitly, but
74 /// the default is to automatically compute a reasonable number of
75 /// iterations based on their timing. For most use cases, it's fire and
76 /// forget.
77 ///
78 /// Generally, the most and least expensive trials will be discarded (all
79 /// sorts of things can happen to give you a few spurious results) and then
80 /// the remainder of trials will be used to compute the average, standard
81 /// deviation, range, and median value, in ns per iteration as well as
82 /// millions of executions per second. The default behavior is just to echo
83 /// the relevant statistics to the console.
84 ///
85 /// The basic use illustrated by this example in which we try to assess
86 /// the difference in speed between acos() and fast_acos():
87 ///
88 /// Benchmarker bench;
89 /// float val = 0.5f;
90 /// clobber (val); // Scrub compiler's knowledge of the value
91 /// bench ("acos", [&](){ DoNotOptimize(std::acos(val)); });
92 /// bench ("fast_acos", [&](){ // alternate indentation style
93 /// DoNotOptimize(OIIO::fast_acos(val));
94 /// });
95 ///
96 /// Which produces output like this:
97 /// acos : 4.3 ns, 230.5 M/s (10x2097152, sdev=0.4ns rng=31.2%, med=4.6)
98 /// fast_acos : 3.4 ns, 291.2 M/s (10x2097152, sdev=0.4ns rng=33.0%, med=3.4)
99 ///
100 /// Some important details:
101 ///
102 /// After declaring the Benchmarker, a number of options can be set: number
103 /// of trials to run, iterations per trial (0 means automatic detection),
104 /// verbosity, whether (or how many) outliers to exclude. You can chain them
105 /// together if you want:
106 /// bench.iterations(10000).trials(10);
107 ///
108 /// It can be VERY hard to get valid benchmarks without the compiler messing
109 /// up your results. Some tips:
110 ///
111 /// * Code that is too fast will not be reliable. Anything that appears
112 /// to take less than 1 ns actually prints "unreliable" instead of full
113 /// stats, figuring that it is likely that it has been inadvertently
114 /// optimized away.
115 ///
116 /// * Use the DoNotOptimize() call on any final results computed by your
117 /// benchmarked code, or else the compiler is likely to remove the code
118 /// that leads to any values it thinks will never be used.
119 ///
120 /// * Beware of the compiler constant folding operations in your code --
121 /// do not pass constants unless you want to benchmark its performance on
122 /// known constants, and it is probably smart to ensure that all variables
123 /// accessed by your code should be passed to clobber() before running
124 /// the benchmark, to confuse the compiler into not assuming its value.
125 
127 public:
129 
130  // Calling Benchmarker like a function (operator()) executes the
131  // benchmark. This process runs func(args...), several trials, each
132  // trial with many iterations. The value returned is the best estimate
133  // of the average time per iteration that it takes to run func.
  //
  // `name` is stored as this benchmark's name (retrievable via name()).
  // When verbose() is nonzero, the Benchmarker is streamed to std::cout
  // after the run. The return value is avg().
134  template<typename FUNC, typename... ARGS>
135  double operator()(string_view name, FUNC func, ARGS&&... args)
136  {
137  m_name = name;
138  run(func, args...);
139  if (verbose())
140  std::cout << (*this) << std::endl;
141  return avg();
142  }
143 
144  // Return the average, sample standard deviation, median, and range
145  // of per-iteration time.
146  double avg() const { return m_avg; }
147  double stddev() const { return m_stddev; }
148  double range() const { return m_range; }
149  double median() const { return m_median; }
150 
151  // Control the number of iterations per trial. The special value 0 means
152  // to determine automatically a reasonable number of iterations. That is
153  // also the default behavior.
155  {
156  m_user_iterations = val;
157  return *this;
158  }
159  size_t iterations() const { return m_iterations; }
160 
161  // Control the number of trials to perform.
163  {
164  m_trials = val;
165  return *this;
166  }
167  size_t trials() const { return m_trials; }
168 
169  // Control the number of values of work that each iteration represents.
170  // Usually you will leave this at the default of 1, but for some cases,
171  // it may be helpful. An example of where you might use this is if you
172  // are benchmarking SIMD operations. A scalar sqrt and an SIMD sqrt may
173  // run in the same amount of time, but the SIMD version is operating on
174  // 4 (or 8, etc.) times as many values. You can use the 'work' size to
175  // make the calls report Mvals/s, showing more accurately that the SIMD
176  // call is faster than the scalar call.
178  {
179  m_work = val;
180  return *this;
181  }
182  size_t work() const { return m_work; }
183 
184  // Control the exclusion of outliers. This number (default 1) of fastest
185  // and slowest trials will be excluded from the statistics, to remove
186  // the effects of spurious things happening on the system. Setting
187  // outliers to 0 will compute statistics on all trials, without any
188  // outlier exclusion.
190  {
191  m_exclude_outliers = e;
192  return *this;
193  }
194  int exclude_outliers() const { return m_exclude_outliers; }
195 
196  // Control the verbosity of the printing for each benchmark. The default
197  // is 1, which prints basic statistics. Verbosity 0 is silent and leaves
198  // it up to the app to retrieve results.
200  {
201  m_verbose = v;
202  return *this;
203  }
204  int verbose() const { return m_verbose; }
205 
206  // Control indentation in the printout -- this number of spaces will
207  // be printed before the statistics.
  // The setter stores the value in m_indent and returns *this; the const
  // overload returns the stored value.
208  Benchmarker& indent(int spaces)
209  {
210  m_indent = spaces;
211  return *this;
212  }
213  int indent() const { return m_indent; }
214 
215  // Choices of unit to report results.
216  enum class Unit : int { autounit, ns, us, ms, s };
217 
218  // Control the units for reporting results. By default, an appropriate
219  // unit will be chosen for nice printing of each benchmark individually.
220  // But the user may also wish to request specific units like ns or us in
221  // order to ensure that all benchmark results are using the same units.
223  {
224  m_units = s;
225  return *this;
226  }
227  Unit units() const { return m_units; }
228 
229  const std::string& name() const { return m_name; }
230 
231 private:
232  size_t m_iterations = 0;
233  size_t m_user_iterations = 0;
234  size_t m_trials = 10;
235  size_t m_work = 1;
236  std::string m_name;
237  std::vector<double> m_times; // times for each trial
238  double m_avg; // average time per iteration
239  double m_stddev; // standard deviation per iteration
240  double m_range; // range per iteration
241  double m_median; // median per-iteration time
242  int m_exclude_outliers = 1;
243  int m_verbose = 1;
244  int m_indent = 0;
245  Unit m_units = Unit::autounit;
246 
// Execute the benchmark for func(args...): choose the per-trial iteration
// count, time m_trials trials into m_times, then compute the statistics.
// Returns avg().
247  template<typename FUNC, typename... ARGS>
248  double run(FUNC func, ARGS&&... args)
249  {
  // A nonzero user-specified iteration count takes precedence; otherwise
  // the count is determined by timing the function.
250  if (m_user_iterations)
251  m_iterations = m_user_iterations;
252  else
253  m_iterations = determine_iterations(func, args...);
254  m_times.resize(m_trials);
255 
  // Subtract the estimated per-iteration overhead (scaled by the
  // iteration count) from every trial, clamping the result at zero.
256  double overhead = iteration_overhead() * iterations();
257  for (auto& t : m_times)
258  t = std::max(0.0, do_trial(m_iterations, func, args...) - overhead);
259  compute_stats();
260  return avg();
261  }
262 
// Pick an iteration count for one trial by timing trials of increasing
// size, starting from 1 iteration, until the trial time is close to
// target_time. Returns the chosen iteration count.
263  template<typename FUNC, typename... ARGS>
264  size_t determine_iterations(FUNC func, ARGS&&... args)
265  {
266  // We're shooting for a trial around 1/100s
267  const double target_time = 0.01;
268  size_t i = 1;
269  while (1) {
270  double t = do_trial (i, func, args...);
271  // std::cout << "Trying " << i << " iters = " << t << "\n";
  // Overshot the target by more than 50%: back off to half as many
  // iterations (only when i > 2).
272  if (t > target_time * 1.5 && i > 2)
273  return i / 2;
  // Close enough to the target (more than 75% of it), or the count has
  // exceeded 2^30: accept the current count.
  // NOTE(review): the cap test is '>', so i can equal 2^30 here and then
  // be multiplied by 8 below; that would wrap if size_t is 32 bits --
  // confirm size_t width on supported targets.
274  if (t > target_time * 0.75 || i > (size_t(1) << 30))
275  return i;
  // Far below the target (under 1/16 of it): grow by 8x; otherwise
  // double.
276  if (t < target_time / 16)
277  i *= 8;
278  else
279  i *= 2;
280  }
281  }
282 
283  template<typename FUNC, typename... ARGS>
284  double do_trial(size_t iterations, FUNC func, ARGS&&... args)
285  {
286  Timer timer;
287  while (iterations--) {
289  func(args...);
290  }
291  return timer();
292  }
293 
294  void compute_stats() { compute_stats(m_times, m_iterations); }
295  void compute_stats(std::vector<double>& times, size_t iterations);
296  double iteration_overhead();
297 
298  friend OIIO_UTIL_API std::ostream& operator<<(std::ostream& out,
299  const Benchmarker& bench);
300 };
301 
302 
303 
304 /// Helper template that runs a function (or functor) n times, using a
305 /// Timer to benchmark the results, and returning the fastest trial. If
306 /// 'range' is non-NULL, the range (max-min) of the various time trials
307 /// will be stored there.
308 ///
309 /// DEPRECATED(1.8): This may be considered obsolete, probably the
310 /// Benchmarker class is a better solution.
311 template<typename FUNC>
312 double
313 time_trial(FUNC func, int ntrials = 1, int nrepeats = 1, double* range = NULL)
314 {
315  double mintime = 1.0e30, maxtime = 0.0;
316  while (ntrials-- > 0) {
317  Timer timer;
318  for (int i = 0; i < nrepeats; ++i) {
319  // Be sure that the repeated calls to func aren't optimized away:
321  func();
322  }
323  double t = timer();
324  if (t < mintime)
325  mintime = t;
326  if (t > maxtime)
327  maxtime = t;
328  }
329  if (range)
330  *range = maxtime - mintime;
331  return mintime;
332 }
333 
334 /// Version without repeats.
/// Forwards to the four-argument time_trial with nrepeats fixed at 1 and
/// returns its result.
335 template<typename FUNC>
336 double
337 time_trial(FUNC func, int ntrials, double* range)
338 {
339  return time_trial(func, ntrials, 1, range);
340 }
341 
342 
343 
344 // Benchmarking helper function: Time a function with various thread counts.
345 // Inputs:
346 // task(int iterations) : The function to run (which understands an
347 // iteration count or work load).
348 // pretask() : Code to run before the task threads start.
349 // posttask() : Code to run after the task threads complete.
350 // out : Stream to print results (or NULL to not print anything).
351 // maxthreads : Don't do any trials greater than this thread count,
352 // even if it's in the threadcounts[].
353 // total_iterations : Total amount of work to do. The func() will be
354 // called with total_iterations/nthreads, so that the
355 // total work for all threads stays close to constant.
356 // ntrials : The number of runs for each thread count (more will take
357 // longer, but be more accurate timing). The best case
358 // run is the one that will be reported.
359 // threadcounts[] : A span<int> giving the set of thread counts
360 // to try.
361 // Return value:
362 // A vector<double> containing the best time (of the trials) for each
363 // thread count. This can be discarded.
364 OIIO_UTIL_API std::vector<double>
365 timed_thread_wedge (function_view<void(int)> task,
366  function_view<void()> pretask,
367  function_view<void()> posttask,
368  std::ostream *out,
369  int maxthreads,
370  int total_iterations, int ntrials,
371  cspan<int> threadcounts = {1,2,4,8,12,16,24,32,48,64,128});
372 
373 // Simplified timed_thread_wedge without pre- and post-tasks, using
374 // std::cout for output, with a default set of thread counts, and not needing
375 // to return the vector of times.
376 OIIO_UTIL_API void
377 timed_thread_wedge (function_view<void(int)> task,
378  int maxthreads, int total_iterations, int ntrials,
379  cspan<int> threadcounts = {1,2,4,8,12,16,24,32,48,64,128});
380 
381 
382 
383 
384 //////////////////////////////////////////////////////////////////////////
385 //////////////////////////////////////////////////////////////////////////
386 // Implementation details...
387 //
388 
389 
390 namespace pvt {
391 void OIIO_UTIL_API use_char_ptr (char const volatile *);
392 }
393 
394 
395 #if ((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)) \
396  && (defined(__x86_64__) || defined(__i386__))
397 
398 // Major non-MS compilers on x86/x86_64: use asm trick to indicate that
399 // the value is needed.
// The asm statement has an empty instruction template: val is listed as an
// input operand and "memory" as a clobber. The function returns val
// unchanged.
400 template <class T>
401 OIIO_FORCEINLINE T const&
402 DoNotOptimize (T const &val) {
403 #if defined(__clang__)
404  // asm volatile("" : "+rm" (const_cast<T&>(val)));
405  // Clang doesn't like the 'X' constraint on `val` and certain GCC versions
406  // don't like the 'g' constraint. Attempt to placate them both.
407  asm volatile("" : : "g"(val) : "memory");
408 #else
  // Non-clang compilers in this branch get the multi-alternative
  // "i,r,m" input constraint instead of "g".
409  asm volatile("" : : "i,r,m"(val) : "memory");
410 #endif
411  return val;
412 }
413 
414 #elif _MSC_VER
415 
416 // Microsoft of course has its own way of turning off optimizations.
// The definition is bracketed by #pragma optimize("", off) / ("", on).
// It passes the address of val (viewed as char const volatile) to
// pvt::use_char_ptr -- declared above; its definition is not visible in
// this header -- then calls _ReadWriteBarrier() and returns val unchanged.
417 #pragma optimize("", off)
418 template <class T>
419 OIIO_FORCEINLINE T const & DoNotOptimize (T const &val) {
420  pvt::use_char_ptr(&reinterpret_cast<char const volatile&>(val));
421  _ReadWriteBarrier ();
422  return val;
423 }
424 #pragma optimize("", on)
425 
426 #elif __has_attribute(__optnone__)
427 
428 // If __optnone__ attribute is available: make a null function with no
429 // optimization, that's all we need.
// Returns val unchanged. Note that this variant is declared plain `inline`
// with __attribute__((__optnone__)) rather than OIIO_FORCEINLINE like the
// other variants.
430 template <class T>
431 inline T const & __attribute__((__optnone__))
432 DoNotOptimize (T const &val) {
433  return val;
434 }
435 
436 #else
437 
438 // Otherwise, it won't work, just make a stub.
// The stub still passes the address of val (viewed as char const volatile)
// to pvt::use_char_ptr before returning val unchanged.
439 template <class T>
440 OIIO_FORCEINLINE T const & DoNotOptimize (T const &val) {
441  pvt::use_char_ptr(&reinterpret_cast<char const volatile&>(val));
442  return val;
443 }
444 
445 #endif
446 
447 
448 
449 #if ((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_COMPILER)) && (defined(__x86_64__) || defined(__i386__))
450 
451 // Special trick for x86/x86_64 and gcc-like compilers
453  asm volatile ("" : : : "memory");
454 }
455 
456 #elif _MSC_VER
457 
459  _ReadWriteBarrier ();
460 }
461 
462 #else
463 
464 // No fallback for other CPUs or compilers. Suggestions?
466 
467 #endif
468 
469 
470 
void OIIO_UTIL_API clobber(void *p)
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
Benchmarker & indent(int spaces)
Definition: benchmark.h:208
size_t trials() const
Definition: benchmark.h:167
GLenum GLint * range
Definition: glcorearb.h:1925
int verbose() const
Definition: benchmark.h:204
size_t iterations() const
Definition: benchmark.h:159
Definition: timer.h:61
#define OIIO_FORCEINLINE
Definition: platform.h:395
void OIIO_UTIL_API use_char_ptr(char const volatile *)
const GLdouble * v
Definition: glcorearb.h:837
GLsizei const GLchar *const * string
Definition: glcorearb.h:814
Definition: span.h:73
GLdouble s
Definition: glad.h:3009
String-related utilities, all in namespace Strutil.
#define OIIO_UTIL_API
Definition: export.h:71
Simple timer class.
Benchmarker & verbose(int v)
Definition: benchmark.h:199
OIIO_FORCEINLINE void clobber_all_memory()
Definition: benchmark.h:465
__attribute__((visibility("default")))
std::ostream & operator<<(std::ostream &ostr, const DataType &a)
Definition: DataType.h:133
int exclude_outliers() const
Definition: benchmark.h:194
const std::string & name() const
Definition: benchmark.h:229
Benchmarker & units(Unit s)
Definition: benchmark.h:222
double median() const
Definition: benchmark.h:149
GLuint const GLchar * name
Definition: glcorearb.h:786
double time_trial(FUNC func, int ntrials=1, int nrepeats=1, double *range=NULL)
Definition: benchmark.h:313
GLdouble t
Definition: glad.h:2397
size_t work() const
Definition: benchmark.h:182
double avg() const
Definition: benchmark.h:146
GLenum func
Definition: glcorearb.h:783
Benchmarker & work(size_t val)
Definition: benchmark.h:177
Benchmarker & iterations(size_t val)
Definition: benchmark.h:154
Benchmarker & exclude_outliers(int e)
Definition: benchmark.h:189
OIIO_NAMESPACE_BEGIN OIIO_FORCEINLINE T const & DoNotOptimize(T const &val)
Definition: benchmark.h:440
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
GLuint GLfloat * val
Definition: glcorearb.h:1608
double range() const
Definition: benchmark.h:148
double stddev() const
Definition: benchmark.h:147
**If you just want to fire and args
Definition: thread.h:609
Unit units() const
Definition: benchmark.h:227
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:94
double operator()(string_view name, FUNC func, ARGS &&...args)
Definition: benchmark.h:135
Benchmarker & trials(size_t val)
Definition: benchmark.h:162
OIIO_UTIL_API std::vector< double > timed_thread_wedge(function_view< void(int)> task, function_view< void()> pretask, function_view< void()> posttask, std::ostream *out, int maxthreads, int total_iterations, int ntrials, cspan< int > threadcounts={1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128})
int indent() const
Definition: benchmark.h:213
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:93