HDK
|
#include "UT_API.h"
#include "UT_Array.h"
#include "UT_PerformanceThread.h"
#include "UT_TaskScope.h"
#include "UT_TBBParallelInvoke.h"
#include "UT_Thread.h"
#include "UT_IteratorRange.h"
#include "UT_Optional.h"
#include <tbb/blocked_range.h>
#include <tbb/blocked_range2d.h>
#include <tbb/task_arena.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/parallel_sort.h>
#include <utility>
#include <iterator>
#include <algorithm>
Go to the source code of this file.
Classes | |
class | UT_BlockedRange< T > |
Declare prior to use. More... | |
class | UT_BlockedRange2D< RowT, ColT > |
struct | UT_EstimatorNumItems< RANGE > |
struct | UT_EstimatorNumItems< UT_BlockedRange2D< T > > |
class | UT_CoarsenedRange< RANGE > |
class | ut_TaskScopedBody< Range, Body > |
class | ut_TaskBody< Range, Body > |
class | ut_ForEachNumberBody< IntType, Body > |
class | ut_TaskScopedInvokeBody< Body > |
class | UT_ParallelInvokePointers< F1 > |
class | UT_ParallelInvokeFunctors< F1 > |
class | ut_ReduceTaskScopedBody< Range, Body > |
class | UT_BlockedRange< T > |
Declare prior to use. More... | |
class | UT_BlockedRange< T >::ValueWrapper |
class | UT_BlockedRange2D< RowT, ColT > |
class | pss::internal::raw_buffer |
Raw memory buffer with automatic cleanup. More... | |
struct | pss::internal::parallel_merge_invoke< RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare > |
struct | pss::internal::parallel_stable_sort_aux_invoke< RandomAccessIterator1, RandomAccessIterator2, Compare > |
Namespaces | |
pss | |
pss::internal | |
Typedefs | |
typedef tbb::split | UT_Split |
Typedef to denote the "split" constructor of a range. More... | |
Functions | |
template<typename RANGE > | |
size_t | UTestimatedNumItems (const RANGE &range) |
This is needed by UT_CoarsenedRange. More... | |
template<typename Range , typename Body > | |
void | UTparallelFor (const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=true) |
template<typename Range , typename Body > | |
void | UTparallelForTaskScope (const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1) |
template<typename Range , typename Body > | |
void | UTparallelForLightItems (const Range &range, const Body &body, const bool force_use_task_scope=true) |
template<typename Range , typename Body > | |
void | UTparallelForHeavyItems (const Range &range, const Body &body) |
template<typename IntType , typename Body > | |
void | UTparallelForEachNumber (IntType nitems, const Body &body, const bool force_use_task_scope=true) |
template<typename IntType , typename Body > | |
void | UTserialForEachNumber (IntType nitems, const Body &body, bool usetaskscope=true) |
template<typename IntType , typename Body > | |
void | UTparallelForEachNumberTaskScope (IntType nitems, const Body &body) |
template<typename Range , typename Body > | |
void | UTserialFor (const Range &range, const Body &body) |
template<typename Body > | |
const ut_TaskScopedInvokeBody < Body > | UTmakeTaskScopedInvokeBody (const Body &body) |
template<typename F1 , typename F2 > | |
void | UTparallelInvoke (bool parallel, F1 &&f1, F2 &&f2) |
template<typename F1 , typename F2 , typename... Rest> | |
void | UTparallelInvoke (bool parallel, F1 &&f1, F2 &&f2, Rest &&...rest) |
template<typename F1 > | |
void | UTparallelInvoke (bool parallel, const UT_Array< F1 * > &funs) |
template<typename F1 > | |
void | UTparallelInvoke (bool parallel, const UT_Array< F1 > &funs) |
template<typename Range , typename Body > | |
void | UTparallelReduce (const Range &range, Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=true) |
template<typename Range , typename Body > | |
void | UTparallelDeterministicReduce (const Range &range, Body &body, const int grain_size, const bool force_use_task_scope=true) |
template<typename Range , typename Body > | |
void | UTparallelReduceLightItems (const Range &range, Body &body) |
template<typename Range , typename Body > | |
void | UTparallelReduceHeavyItems (const Range &range, Body &body) |
template<typename Range , typename Body > | |
void | UTserialReduce (const Range &range, Body &body) |
template<typename RandomAccessIterator , typename Compare > | |
void | UTparallelSort (RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare) |
template<typename RandomAccessIterator > | |
void | UTparallelSort (RandomAccessIterator begin, RandomAccessIterator end) |
template<typename T > | |
void | UTparallelSort (T *begin, T *end) |
template<typename RandomAccessIterator , typename Compare > | |
void | pss::parallel_stable_sort (RandomAccessIterator xs, RandomAccessIterator xe, Compare comp) |
template<class RandomAccessIterator > | |
void | pss::parallel_stable_sort (RandomAccessIterator xs, RandomAccessIterator xe) |
Wrapper for sorting with default comparator. More... | |
template<typename RandomAccessIterator , typename Compare > | |
void | UTparallelStableSort (RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare) |
template<typename RandomAccessIterator > | |
void | UTparallelStableSort (RandomAccessIterator begin, RandomAccessIterator end) |
template<typename T > | |
void | UTparallelStableSort (T *begin, T *end) |
template<typename T , typename Compare > | |
void | UTparallelStableSort (T *begin, T *end, const Compare &compare) |
template<typename T > | |
void | UTparallelStableSort (T &a) |
template<typename T , typename Compare > | |
void | UTparallelStableSort (T &a, const Compare &compare) |
template<typename Op , typename T > | |
void | UTparallelDeterministicPrefixSumInPlace (UT_Array< T > &array, const T identity, const Op &op, const int grain_size=1024, const bool force_use_task_scope=true) |
template<class RandomAccessIterator > | |
void | pss::internal::serial_destroy (RandomAccessIterator zs, RandomAccessIterator ze) |
Destroy sequence [xs,xe) More... | |
template<class RandomAccessIterator1 , class RandomAccessIterator2 , class RandomAccessIterator3 , class Compare > | |
void | pss::internal::serial_move_merge (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp) |
Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move. More... | |
template<typename RandomAccessIterator1 , typename RandomAccessIterator2 , typename Compare > | |
void | pss::internal::stable_sort_base_case (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) |
template<typename RandomAccessIterator1 , typename RandomAccessIterator2 , typename RandomAccessIterator3 , typename Compare > | |
void | pss::internal::parallel_merge (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp) |
template<typename RandomAccessIterator1 , typename RandomAccessIterator2 , typename Compare > | |
void | pss::internal::parallel_stable_sort_aux (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) |
typedef tbb::split UT_Split |
Typedef to denote the "split" constructor of a range.
Definition at line 53 of file UT_ParallelUtil.h.
|
inline |
This is needed by UT_CoarsenedRange.
Definition at line 88 of file UT_ParallelUtil.h.
const ut_TaskScopedInvokeBody<Body> UTmakeTaskScopedInvokeBody | ( | const Body & | body | ) |
Takes a functor for passing to UTparallelInvoke, and wraps it in a ut_TaskScopeInvokeBody object so the functor will be invoked wrapped in a UT_TaskScope that makes it safe to use UT_TaskLock objects that are currently locked by the parent scope.
Definition at line 493 of file UT_ParallelUtil.h.
void UTparallelDeterministicPrefixSumInPlace | ( | UT_Array< T > & | array, |
const T | identity, | ||
const Op & | op, | ||
const int | grain_size = 1024 , |
||
const bool | force_use_task_scope = true |
||
) |
Performs a prefix sum across all the entries of the array. Ie, for (int i = 1; i < array.entries(); i++) array(i) = OP(array(i-1), array(i)); tbb has this as tbb_parallel_scan but does not guarantee determinism. Note determinism is based on grain size, so that must be fixed.
Definition at line 1083 of file UT_ParallelUtil.h.
void UTparallelDeterministicReduce | ( | const Range & | range, |
Body & | body, | ||
const int | grain_size, | ||
const bool | force_use_task_scope = true |
||
) |
This is a simple wrapper for deterministic reduce that uses tbb. It works in the same manner as UTparallelReduce, with the following differences:
Definition at line 776 of file UT_ParallelUtil.h.
void UTparallelFor | ( | const Range & | range, |
const Body & | body, | ||
const int | subscribe_ratio = 2 , |
||
const int | min_grain_size = 1 , |
||
const bool | force_use_task_scope = true |
||
) |
Run the body
function over a range in parallel. UTparallelFor attempts to spread the range out over at most subscribe_ratio * num_processor tasks. The factor subscribe_ratio can be used to help balance the load. UTparallelFor() uses tbb for its implementation. The used grain size is the maximum of min_grain_size and if UTestimatedNumItems(range) / (subscribe_ratio * num_processor). If subscribe_ratio == 0, then a grain size of min_grain_size will be used. A range can be split only when UTestimatedNumItems(range) exceeds the grain size the range is divisible. Requirements for the Range functor are:
Requirements for the Body function are:
The requirements for a Range object are:
r
into two sub-ranges (i.e. modify r
and *this)Example:
Definition at line 292 of file UT_ParallelUtil.h.
void UTparallelForEachNumber | ( | IntType | nitems, |
const Body & | body, | ||
const bool | force_use_task_scope = true |
||
) |
Version of UTparallelFor tuned for a range consists of heavy items, for example, defragmenting an entire attribute.
This approach uses "ideal" load balancing across threads and doesn't rely on the TBB task scheduler for splitting the range. Instead, it iterates from 0
to nitems
, calling body
with a UT_BlockedRange<IntType> containing a list of tasks to execute.
IntType
must work with SYS_AtomicInt
(currently int32 or int64). If you get a boost static assertion, please make sure the body
range takes the proper integer type. Definition at line 403 of file UT_ParallelUtil.h.
void UTparallelForEachNumberTaskScope | ( | IntType | nitems, |
const Body & | body | ||
) |
Version of UTparallelForEachNumber that wraps the body in a UT_TaskScope that makes it safe to use UT_TaskLock objects that are currently locked by the parent scope.
Definition at line 445 of file UT_ParallelUtil.h.
void UTparallelForHeavyItems | ( | const Range & | range, |
const Body & | body | ||
) |
Version of UTparallelFor that is tuned for the case where the range consists of heavy items, for example, defragmenting an entire attribute.
If possible, UTparallelForEachNumber() is preferred over use of UTparallelForHeavyItems().
Note, when the range is guaranteed to be small, you might prefer to run UTparallelFor(range, body, 0, 1)
. That form of the loop would guarantee that a separate task is started for each iteration of the body. However, that form can cause issues when the range gets large, in that a large number of tasks may be created.
Definition at line 380 of file UT_ParallelUtil.h.
void UTparallelForLightItems | ( | const Range & | range, |
const Body & | body, | ||
const bool | force_use_task_scope = true |
||
) |
Version of UTparallelFor that is tuned for the case where the range consists of lightweight items, for example, float additions or matrix-vector multiplications.
Definition at line 359 of file UT_ParallelUtil.h.
void UTparallelForTaskScope | ( | const Range & | range, |
const Body & | body, | ||
const int | subscribe_ratio = 2 , |
||
const int | min_grain_size = 1 |
||
) |
Version of UTparallelFor that always creates a task scope to prevent deadlocking of child tasks that might acquire UT_TaskLocks.
Definition at line 345 of file UT_ParallelUtil.h.
|
inline |
UTparallelInvoke() executes the given functions in parallel when the parallel flag is true - otherwise it runs them serially. F1 and F2 should be void functors.
Definition at line 502 of file UT_ParallelUtil.h.
|
inline |
Definition at line 517 of file UT_ParallelUtil.h.
|
inline |
UTparallelInvoke() executes the array of functions in parallel when the parallel flag is true - otherwise it runs them serially. F1 should be a void functor.
Definition at line 551 of file UT_ParallelUtil.h.
UTparallelInvoke() executes the array of functions in parallel when the parallel flag is true - otherwise it runs them serially. F1 should be a void functor.
Definition at line 585 of file UT_ParallelUtil.h.
void UTparallelReduce | ( | const Range & | range, |
Body & | body, | ||
const int | subscribe_ratio = 2 , |
||
const int | min_grain_size = 1 , |
||
const bool | force_use_task_scope = true |
||
) |
UTparallelReduce() is a simple wrapper that uses tbb for its implementation. Run the body
function over a range in parallel.
WARNING: The operator()()
and join()
functions MUST NOT initialize data! Both of these functions MUST ONLY accumulate data! This is because TBB may re-use body objects for multiple ranges. Effectively, operator()() must act as an in-place join operation for data as it comes in. Initialization must be kept to the constructors of Body.
Requirements for the Body function are:
r.operator()()
and r.join()
, so this should not copy values accumulating in r.The requirements for a Range object are:
r
into two sub-ranges (i.e. modify r
and *this)Example:
Definition at line 716 of file UT_ParallelUtil.h.
void UTparallelReduceHeavyItems | ( | const Range & | range, |
Body & | body | ||
) |
Version of UTparallelReduce that is tuned for the case where the range consists of heavy items, for example, computing the bounding box of a list of geometry objects.
Definition at line 823 of file UT_ParallelUtil.h.
void UTparallelReduceLightItems | ( | const Range & | range, |
Body & | body | ||
) |
Version of UTparallelReduce that is tuned for the case where the range consists of lightweight items, for example, finding the min/max in a set of integers.
Definition at line 814 of file UT_ParallelUtil.h.
void UTparallelSort | ( | RandomAccessIterator | begin, |
RandomAccessIterator | end, | ||
const Compare & | compare | ||
) |
UTparallelSort() is a simple wrapper that uses tbb for its implementation.
WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability if needed.
Definition at line 847 of file UT_ParallelUtil.h.
void UTparallelSort | ( | RandomAccessIterator | begin, |
RandomAccessIterator | end | ||
) |
UTparallelSort() is a simple wrapper that uses tbb for its implementation.
WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability if needed.
Definition at line 860 of file UT_ParallelUtil.h.
void UTparallelSort | ( | T * | begin, |
T * | end | ||
) |
UTparallelSort() is a simple wrapper that uses tbb for its implementation.
WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability if needed.
Definition at line 873 of file UT_ParallelUtil.h.
void UTparallelStableSort | ( | RandomAccessIterator | begin, |
RandomAccessIterator | end, | ||
const Compare & | compare | ||
) |
UTparalleStableSort() is a stable parallel merge sort.
NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort
. NOTE: Element initialization is done via std::move
, so non-POD element types should implement c++11 move semantics.
Definition at line 904 of file UT_ParallelUtil.h.
void UTparallelStableSort | ( | RandomAccessIterator | begin, |
RandomAccessIterator | end | ||
) |
UTparalleStableSort() is a stable parallel merge sort.
NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort
. NOTE: Element initialization is done via std::move
, so non-POD element types should implement c++11 move semantics.
Definition at line 917 of file UT_ParallelUtil.h.
void UTparallelStableSort | ( | T * | begin, |
T * | end | ||
) |
UTparalleStableSort() is a stable parallel merge sort.
NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort
. NOTE: Element initialization is done via std::move
, so non-POD element types should implement c++11 move semantics.
Definition at line 929 of file UT_ParallelUtil.h.
void UTparallelStableSort | ( | T * | begin, |
T * | end, | ||
const Compare & | compare | ||
) |
UTparalleStableSort() is a stable parallel merge sort.
NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort
. NOTE: Element initialization is done via std::move
, so non-POD element types should implement c++11 move semantics.
Definition at line 941 of file UT_ParallelUtil.h.
void UTparallelStableSort | ( | T & | a | ) |
UTparalleStableSort() is a stable parallel merge sort. This form works with UT_Array and other containers with begin/end members.
NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort
. NOTE: Element initialization is done via std::move
, so non-POD element types should implement c++11 move semantics.
Definition at line 956 of file UT_ParallelUtil.h.
void UTparallelStableSort | ( | T & | a, |
const Compare & | compare | ||
) |
UTparalleStableSort() is a stable parallel merge sort. This form works with UT_Array and other containers with begin/end members.
NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort
. NOTE: Element initialization is done via std::move
, so non-POD element types should implement c++11 move semantics.
Definition at line 971 of file UT_ParallelUtil.h.
void UTserialFor | ( | const Range & | range, |
const Body & | body | ||
) |
UTserialFor can be used as a debugging tool to quickly replace a parallel for with a serial for.
Definition at line 453 of file UT_ParallelUtil.h.
void UTserialForEachNumber | ( | IntType | nitems, |
const Body & | body, | ||
bool | usetaskscope = true |
||
) |
UTserialForEachNumber can be used as a debugging tool to quickly replace a parallel for with a serial for.
Definition at line 434 of file UT_ParallelUtil.h.
void UTserialReduce | ( | const Range & | range, |
Body & | body | ||
) |
UTserialReduce can be used as a debugging tool to quickly replace a parallel reduce with a serial for.
Definition at line 831 of file UT_ParallelUtil.h.