HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_ParallelUtil.h File Reference
#include "UT_API.h"
#include "UT_Array.h"
#include "UT_PerformanceThread.h"
#include "UT_TaskScope.h"
#include "UT_TBBParallelInvoke.h"
#include "UT_Thread.h"
#include "UT_IteratorRange.h"
#include "UT_Optional.h"
#include <tbb/blocked_range.h>
#include <tbb/blocked_range2d.h>
#include <tbb/task_arena.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/parallel_sort.h>
#include <utility>
#include <iterator>
#include <algorithm>
+ Include dependency graph for UT_ParallelUtil.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  UT_BlockedRange< T >
 Declare prior to use. More...
 
class  UT_BlockedRange2D< RowT, ColT >
 
struct  UT_EstimatorNumItems< RANGE >
 
struct  UT_EstimatorNumItems< UT_BlockedRange2D< T > >
 
class  UT_CoarsenedRange< RANGE >
 
class  ut_TaskScopedBody< Range, Body >
 
class  ut_TaskBody< Range, Body >
 
class  ut_ForEachNumberBody< IntType, Body >
 
class  ut_TaskScopedInvokeBody< Body >
 
class  UT_ParallelInvokePointers< F1 >
 
class  UT_ParallelInvokeFunctors< F1 >
 
class  ut_ReduceTaskScopedBody< Range, Body >
 
class  UT_BlockedRange< T >
 Declare prior to use. More...
 
class  UT_BlockedRange< T >::ValueWrapper
 
class  UT_BlockedRange2D< RowT, ColT >
 
class  pss::internal::raw_buffer
 Raw memory buffer with automatic cleanup. More...
 
struct  pss::internal::parallel_merge_invoke< RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare >
 
struct  pss::internal::parallel_stable_sort_aux_invoke< RandomAccessIterator1, RandomAccessIterator2, Compare >
 

Namespaces

 pss
 
 pss::internal
 

Typedefs

typedef tbb::split UT_Split
 Typedef to denote the "split" constructor of a range. More...
 

Functions

template<typename RANGE >
size_t UTestimatedNumItems (const RANGE &range)
 This is needed by UT_CoarsenedRange. More...
 
template<typename Range , typename Body >
void UTparallelFor (const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=true)
 
template<typename Range , typename Body >
void UTparallelForTaskScope (const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
 
template<typename Range , typename Body >
void UTparallelForLightItems (const Range &range, const Body &body, const bool force_use_task_scope=true)
 
template<typename Range , typename Body >
void UTparallelForHeavyItems (const Range &range, const Body &body)
 
template<typename IntType , typename Body >
void UTparallelForEachNumber (IntType nitems, const Body &body, const bool force_use_task_scope=true)
 
template<typename IntType , typename Body >
void UTserialForEachNumber (IntType nitems, const Body &body, bool usetaskscope=true)
 
template<typename IntType , typename Body >
void UTparallelForEachNumberTaskScope (IntType nitems, const Body &body)
 
template<typename Range , typename Body >
void UTserialFor (const Range &range, const Body &body)
 
template<typename Body >
const ut_TaskScopedInvokeBody
< Body > 
UTmakeTaskScopedInvokeBody (const Body &body)
 
template<typename F1 , typename F2 >
void UTparallelInvoke (bool parallel, F1 &&f1, F2 &&f2)
 
template<typename F1 , typename F2 , typename... Rest>
void UTparallelInvoke (bool parallel, F1 &&f1, F2 &&f2, Rest &&...rest)
 
template<typename F1 >
void UTparallelInvoke (bool parallel, const UT_Array< F1 * > &funs)
 
template<typename F1 >
void UTparallelInvoke (bool parallel, const UT_Array< F1 > &funs)
 
template<typename Range , typename Body >
void UTparallelReduce (const Range &range, Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=true)
 
template<typename Range , typename Body >
void UTparallelDeterministicReduce (const Range &range, Body &body, const int grain_size, const bool force_use_task_scope=true)
 
template<typename Range , typename Body >
void UTparallelReduceLightItems (const Range &range, Body &body)
 
template<typename Range , typename Body >
void UTparallelReduceHeavyItems (const Range &range, Body &body)
 
template<typename Range , typename Body >
void UTserialReduce (const Range &range, Body &body)
 
template<typename RandomAccessIterator , typename Compare >
void UTparallelSort (RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
 
template<typename RandomAccessIterator >
void UTparallelSort (RandomAccessIterator begin, RandomAccessIterator end)
 
template<typename T >
void UTparallelSort (T *begin, T *end)
 
template<typename RandomAccessIterator , typename Compare >
void pss::parallel_stable_sort (RandomAccessIterator xs, RandomAccessIterator xe, Compare comp)
 
template<class RandomAccessIterator >
void pss::parallel_stable_sort (RandomAccessIterator xs, RandomAccessIterator xe)
 Wrapper for sorting with default comparator. More...
 
template<typename RandomAccessIterator , typename Compare >
void UTparallelStableSort (RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
 
template<typename RandomAccessIterator >
void UTparallelStableSort (RandomAccessIterator begin, RandomAccessIterator end)
 
template<typename T >
void UTparallelStableSort (T *begin, T *end)
 
template<typename T , typename Compare >
void UTparallelStableSort (T *begin, T *end, const Compare &compare)
 
template<typename T >
void UTparallelStableSort (T &a)
 
template<typename T , typename Compare >
void UTparallelStableSort (T &a, const Compare &compare)
 
template<typename Op , typename T >
void UTparallelDeterministicPrefixSumInPlace (UT_Array< T > &array, const T identity, const Op &op, const int grain_size=1024, const bool force_use_task_scope=true)
 
template<class RandomAccessIterator >
void pss::internal::serial_destroy (RandomAccessIterator zs, RandomAccessIterator ze)
 Destroy sequence [xs,xe) More...
 
template<class RandomAccessIterator1 , class RandomAccessIterator2 , class RandomAccessIterator3 , class Compare >
void pss::internal::serial_move_merge (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp)
 Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move. More...
 
template<typename RandomAccessIterator1 , typename RandomAccessIterator2 , typename Compare >
void pss::internal::stable_sort_base_case (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
 
template<typename RandomAccessIterator1 , typename RandomAccessIterator2 , typename RandomAccessIterator3 , typename Compare >
void pss::internal::parallel_merge (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)
 
template<typename RandomAccessIterator1 , typename RandomAccessIterator2 , typename Compare >
void pss::internal::parallel_stable_sort_aux (RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
 

Typedef Documentation

typedef tbb::split UT_Split

Typedef to denote the "split" constructor of a range.

Definition at line 53 of file UT_ParallelUtil.h.

Function Documentation

template<typename RANGE >
size_t UTestimatedNumItems ( const RANGE &  range)
inline

This is needed by UT_CoarsenedRange.

Definition at line 88 of file UT_ParallelUtil.h.

template<typename Body >
const ut_TaskScopedInvokeBody<Body> UTmakeTaskScopedInvokeBody ( const Body &  body)

Takes a functor for passing to UTparallelInvoke, and wraps it in a ut_TaskScopeInvokeBody object so the functor will be invoked wrapped in a UT_TaskScope that makes it safe to use UT_TaskLock objects that are currently locked by the parent scope.

Definition at line 493 of file UT_ParallelUtil.h.

template<typename Op , typename T >
void UTparallelDeterministicPrefixSumInPlace ( UT_Array< T > &  array,
const T  identity,
const Op &  op,
const int  grain_size = 1024,
const bool  force_use_task_scope = true 
)

Performs a prefix sum across all the entries of the array. Ie, for (int i = 1; i < array.entries(); i++) array(i) = OP(array(i-1), array(i)); tbb has this as tbb_parallel_scan but does not guarantee determinism. Note determinism is based on grain size, so that must be fixed.

Definition at line 1083 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelDeterministicReduce ( const Range &  range,
Body &  body,
const int  grain_size,
const bool  force_use_task_scope = true 
)

This is a simple wrapper for deterministic reduce that uses tbb. It works in the same manner as UTparallelReduce, with the following differences:

  • reduction and join order is deterministic (devoid of threading uncertainty;
  • a fixed grain size must be provided by the caller; grain size is not adjusted based on the available resources (this is required to satisfy determinism). This version should be used when task joining is not associative (such as accumulation of a floating point residual).

Definition at line 776 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelFor ( const Range &  range,
const Body &  body,
const int  subscribe_ratio = 2,
const int  min_grain_size = 1,
const bool  force_use_task_scope = true 
)

Run the body function over a range in parallel. UTparallelFor attempts to spread the range out over at most subscribe_ratio * num_processor tasks. The factor subscribe_ratio can be used to help balance the load. UTparallelFor() uses tbb for its implementation. The used grain size is the maximum of min_grain_size and if UTestimatedNumItems(range) / (subscribe_ratio * num_processor). If subscribe_ratio == 0, then a grain size of min_grain_size will be used. A range can be split only when UTestimatedNumItems(range) exceeds the grain size the range is divisible. Requirements for the Range functor are:

  • the requirements of the tbb Range Concept
  • UT_estimatorNumItems<Range> must return the the estimated number of work items for the range. When Range::size() is not the correct estimate, then a (partial) specialization of UT_estimatorNumItemsimatorRange must be provided for the type Range.

Requirements for the Body function are:

  • Body(const Body &);

    Copy Constructor
  • Body()::~Body();

    Destructor
  • void Body::operator()(const Range &range) const;
    Function call to perform operation on the range. Note the operator is const.

The requirements for a Range object are:

  • Range::Range(const Range&);

    Copy constructor
  • Range::~Range();

    Destructor
  • bool Range::is_divisible() const;

    True if the range can be partitioned into two sub-ranges
  • bool Range::empty() const;

    True if the range is empty
  • Range::Range(Range &r, UT_Split) const;

    Split the range r into two sub-ranges (i.e. modify r and *this)

Example:

class Square {
public:
Square(fpreal *data) : myData(data) {}
~Square();
void operator()(const UT_BlockedRange<int64> &range) const
{
for (int64 i = range.begin(); i != range.end(); ++i)
myData[i] *= myData[i];
}
fpreal *myData;
};
...
void
parallel_square(fpreal *array, int64 length)
{
UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array));
}
See Also
UTparallelReduce(), UT_BlockedRange()

Definition at line 292 of file UT_ParallelUtil.h.

template<typename IntType , typename Body >
void UTparallelForEachNumber ( IntType  nitems,
const Body &  body,
const bool  force_use_task_scope = true 
)

Version of UTparallelFor tuned for a range consists of heavy items, for example, defragmenting an entire attribute.

This approach uses "ideal" load balancing across threads and doesn't rely on the TBB task scheduler for splitting the range. Instead, it iterates from 0 to nitems, calling body with a UT_BlockedRange<IntType> containing a list of tasks to execute.

Note
The IntType must work with SYS_AtomicInt (currently int32 or int64). If you get a boost static assertion, please make sure the body range takes the proper integer type.

Definition at line 403 of file UT_ParallelUtil.h.

template<typename IntType , typename Body >
void UTparallelForEachNumberTaskScope ( IntType  nitems,
const Body &  body 
)

Version of UTparallelForEachNumber that wraps the body in a UT_TaskScope that makes it safe to use UT_TaskLock objects that are currently locked by the parent scope.

Definition at line 445 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelForHeavyItems ( const Range &  range,
const Body &  body 
)

Version of UTparallelFor that is tuned for the case where the range consists of heavy items, for example, defragmenting an entire attribute.

If possible, UTparallelForEachNumber() is preferred over use of UTparallelForHeavyItems().

Note, when the range is guaranteed to be small, you might prefer to run UTparallelFor(range, body, 0, 1). That form of the loop would guarantee that a separate task is started for each iteration of the body. However, that form can cause issues when the range gets large, in that a large number of tasks may be created.

Definition at line 380 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelForLightItems ( const Range &  range,
const Body &  body,
const bool  force_use_task_scope = true 
)

Version of UTparallelFor that is tuned for the case where the range consists of lightweight items, for example, float additions or matrix-vector multiplications.

Examples:
tetprim/GEO_PrimTetra.C.

Definition at line 359 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelForTaskScope ( const Range &  range,
const Body &  body,
const int  subscribe_ratio = 2,
const int  min_grain_size = 1 
)

Version of UTparallelFor that always creates a task scope to prevent deadlocking of child tasks that might acquire UT_TaskLocks.

Definition at line 345 of file UT_ParallelUtil.h.

template<typename F1 , typename F2 >
void UTparallelInvoke ( bool  parallel,
F1 &&  f1,
F2 &&  f2 
)
inline

UTparallelInvoke() executes the given functions in parallel when the parallel flag is true - otherwise it runs them serially. F1 and F2 should be void functors.

Definition at line 502 of file UT_ParallelUtil.h.

template<typename F1 , typename F2 , typename... Rest>
void UTparallelInvoke ( bool  parallel,
F1 &&  f1,
F2 &&  f2,
Rest &&...  rest 
)
inline

Definition at line 517 of file UT_ParallelUtil.h.

template<typename F1 >
void UTparallelInvoke ( bool  parallel,
const UT_Array< F1 * > &  funs 
)
inline

UTparallelInvoke() executes the array of functions in parallel when the parallel flag is true - otherwise it runs them serially. F1 should be a void functor.

Definition at line 551 of file UT_ParallelUtil.h.

template<typename F1 >
void UTparallelInvoke ( bool  parallel,
const UT_Array< F1 > &  funs 
)
inline

UTparallelInvoke() executes the array of functions in parallel when the parallel flag is true - otherwise it runs them serially. F1 should be a void functor.

Definition at line 585 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelReduce ( const Range &  range,
Body &  body,
const int  subscribe_ratio = 2,
const int  min_grain_size = 1,
const bool  force_use_task_scope = true 
)

UTparallelReduce() is a simple wrapper that uses tbb for its implementation. Run the body function over a range in parallel.

WARNING: The operator()() and join() functions MUST NOT initialize data! Both of these functions MUST ONLY accumulate data! This is because TBB may re-use body objects for multiple ranges. Effectively, operator()() must act as an in-place join operation for data as it comes in. Initialization must be kept to the constructors of Body.

Requirements for the Body function are:

  • Body()::~Body();

    Destructor
  • Body::Body(Body &r, UT_Split) const;

    The splitting constructor. WARNING: This must be able to run concurrently with calls to r.operator()() and r.join(), so this should not copy values accumulating in r.
  • void Body::operator()(const Range &range);
    Function call to perform operation on the range. Note the operator is not const.
  • void Body::join(const Body &other);
    Join the results from another operation with this operation. not const.

The requirements for a Range object are:

  • Range::Range(const Range&);

    Copy constructor
  • Range::~Range();

    Destructor
  • bool Range::is_divisible() const;

    True if the range can be partitioned into two sub-ranges
  • bool Range::empty() const;

    True if the range is empty
  • Range::Range(Range &r, UT_Split) const;

    Split the range r into two sub-ranges (i.e. modify r and *this)

Example:

class Dot {
public:
Dot(const fpreal *a, const fpreal *b)
: myA(a)
, myB(b)
, mySum(0)
{}
Dot(Dot &src, UT_Split)
: myA(src.myA)
, myB(src.myB)
, mySum(0)
{}
void operator()(const UT_BlockedRange<int64> &range)
{
for (int64 i = range.begin(); i != range.end(); ++i)
mySum += myA[i] * myB[i];
}
void join(const Dot &other)
{
mySum += other.mySum;
}
fpreal mySum;
const fpreal *myA, *myB;
};
parallel_dot(const fpreal *a, const fpreal *b, int64 length)
{
Dot body(a, b);
return body.mySum;
}
See Also
UTparallelFor(), UT_BlockedRange()

Definition at line 716 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelReduceHeavyItems ( const Range &  range,
Body &  body 
)

Version of UTparallelReduce that is tuned for the case where the range consists of heavy items, for example, computing the bounding box of a list of geometry objects.

Definition at line 823 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTparallelReduceLightItems ( const Range &  range,
Body &  body 
)

Version of UTparallelReduce that is tuned for the case where the range consists of lightweight items, for example, finding the min/max in a set of integers.

Definition at line 814 of file UT_ParallelUtil.h.

template<typename RandomAccessIterator , typename Compare >
void UTparallelSort ( RandomAccessIterator  begin,
RandomAccessIterator  end,
const Compare &  compare 
)

UTparallelSort() is a simple wrapper that uses tbb for its implementation.

WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability if needed.

Examples:
tetprim/GEO_PrimTetra.C.

Definition at line 847 of file UT_ParallelUtil.h.

template<typename RandomAccessIterator >
void UTparallelSort ( RandomAccessIterator  begin,
RandomAccessIterator  end 
)

UTparallelSort() is a simple wrapper that uses tbb for its implementation.

WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability if needed.

Definition at line 860 of file UT_ParallelUtil.h.

template<typename T >
void UTparallelSort ( T *  begin,
T *  end 
)

UTparallelSort() is a simple wrapper that uses tbb for its implementation.

WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability if needed.

Definition at line 873 of file UT_ParallelUtil.h.

template<typename RandomAccessIterator , typename Compare >
void UTparallelStableSort ( RandomAccessIterator  begin,
RandomAccessIterator  end,
const Compare &  compare 
)

UTparalleStableSort() is a stable parallel merge sort.

NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort. NOTE: Element initialization is done via std::move, so non-POD element types should implement c++11 move semantics.

Definition at line 904 of file UT_ParallelUtil.h.

template<typename RandomAccessIterator >
void UTparallelStableSort ( RandomAccessIterator  begin,
RandomAccessIterator  end 
)

UTparalleStableSort() is a stable parallel merge sort.

NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort. NOTE: Element initialization is done via std::move, so non-POD element types should implement c++11 move semantics.

Definition at line 917 of file UT_ParallelUtil.h.

template<typename T >
void UTparallelStableSort ( T *  begin,
T *  end 
)

UTparalleStableSort() is a stable parallel merge sort.

NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort. NOTE: Element initialization is done via std::move, so non-POD element types should implement c++11 move semantics.

Definition at line 929 of file UT_ParallelUtil.h.

template<typename T , typename Compare >
void UTparallelStableSort ( T *  begin,
T *  end,
const Compare &  compare 
)

UTparalleStableSort() is a stable parallel merge sort.

NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort. NOTE: Element initialization is done via std::move, so non-POD element types should implement c++11 move semantics.

Definition at line 941 of file UT_ParallelUtil.h.

template<typename T >
void UTparallelStableSort ( T &  a)

UTparalleStableSort() is a stable parallel merge sort. This form works with UT_Array and other containers with begin/end members.

NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort. NOTE: Element initialization is done via std::move, so non-POD element types should implement c++11 move semantics.

Definition at line 956 of file UT_ParallelUtil.h.

template<typename T , typename Compare >
void UTparallelStableSort ( T &  a,
const Compare &  compare 
)

UTparalleStableSort() is a stable parallel merge sort. This form works with UT_Array and other containers with begin/end members.

NOTE: UTparallelStableSort requires a temporary buffer of size end-begin. On allocation failure it falls back to calling std::stable_sort. NOTE: Element initialization is done via std::move, so non-POD element types should implement c++11 move semantics.

Definition at line 971 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTserialFor ( const Range &  range,
const Body &  body 
)

UTserialFor can be used as a debugging tool to quickly replace a parallel for with a serial for.

Definition at line 453 of file UT_ParallelUtil.h.

template<typename IntType , typename Body >
void UTserialForEachNumber ( IntType  nitems,
const Body &  body,
bool  usetaskscope = true 
)

UTserialForEachNumber can be used as a debugging tool to quickly replace a parallel for with a serial for.

Definition at line 434 of file UT_ParallelUtil.h.

template<typename Range , typename Body >
void UTserialReduce ( const Range &  range,
Body &  body 
)

UTserialReduce can be used as a debugging tool to quickly replace a parallel reduce with a serial for.

Definition at line 831 of file UT_ParallelUtil.h.