HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
gpu_profiler_common.h
Go to the documentation of this file.
1 #pragma once
2 
5 
6 #include <map>
7 #include <memory>
8 #include <mutex>
9 #include <sstream>
10 #include <string>
11 #include <vector>
12 #include <utility>
13 
14 namespace onnxruntime {
15 namespace profiling {
16 
17 // The classes in this header are implemented as template/inline classes
18 // to avoid having to export symbols from the main onnxruntime shared library
19 // to ExecutionProvider (EP) shared libraries.
20 // More context: The main onnxruntime shared library is optimized for size
21 // using --gc-sections during link time to ensure that any unreferenced code
22 // is not retained. This poses a problem in using a design pattern where the
23 // (abstract) base class is implemented in the main onnxruntime shared library,
24 // but (concrete) subclasses are implemented in EP shared libraries. Now, because
25 // EP shared libraries are loaded at runtime (as of 11/2022), there will be no
26 // references to the base class symbols when the main onnxruntime shared library
27 // is compiled. Thus, the base class symbols will not be included in the
28 // main onnxruntime shared library. This manifests in being unable to load
29 // EP shared libs (because the base class symbols referenced by derived
30 // classes are missing).
31 // We solve this by implementing base classes that are common to all GPU profilers
32 // inline in this header.
33 
// Owning, heap-allocated byte buffer used to carry raw activity records.
// Copies are deep; moves transfer the storage and leave the source empty
// (null data, size 0).
class ProfilerActivityBuffer {
 public:
  // Creates an empty buffer: no storage, size 0.
  ProfilerActivityBuffer() noexcept
      : data_(nullptr), size_(0) {}

  // Creates a buffer holding a private copy of the `size` bytes at `data`.
  // The copy is skipped when there is nothing to copy, so a null `data`
  // (e.g. when copying an empty buffer) is never handed to memcpy.
  // NOTE: noexcept is kept for interface compatibility; an allocation
  // failure here therefore terminates the process.
  ProfilerActivityBuffer(const char* data, size_t size) noexcept
      : data_(std::make_unique<char[]>(size)), size_(size) {
    if (data != nullptr && size_ > 0) {
      memcpy(data_.get(), data, size_);
    }
  }

  // Deep copy.
  ProfilerActivityBuffer(const ProfilerActivityBuffer& other) noexcept
      : ProfilerActivityBuffer(other.GetData(), other.GetSize()) {}

  // Takes over the storage of `other`, leaving it empty.
  ProfilerActivityBuffer(ProfilerActivityBuffer&& other) noexcept
      : data_(std::move(other.data_)), size_(other.size_) {
    other.size_ = 0;
  }

  // Copy assignment (copy-and-swap). The previous storage is released when
  // the temporary goes out of scope; constructing in place over `*this`
  // would overwrite data_ without freeing it.
  ProfilerActivityBuffer& operator=(const ProfilerActivityBuffer& other) noexcept {
    if (&other != this) {
      ProfilerActivityBuffer copy{other};
      std::swap(data_, copy.data_);
      std::swap(size_, copy.size_);
    }
    return *this;
  }

  // Move assignment. unique_ptr's move assignment frees the storage
  // previously owned by `*this`; `other` is left empty.
  ProfilerActivityBuffer& operator=(ProfilerActivityBuffer&& other) noexcept {
    if (&other != this) {
      data_ = std::move(other.data_);
      size_ = other.size_;
      other.size_ = 0;
    }
    return *this;
  }

  // Wraps an already-allocated buffer of `size` bytes without copying it.
  static ProfilerActivityBuffer CreateFromPreallocatedBuffer(std::unique_ptr<char[]>&& buffer_ptr, size_t size) {
    ProfilerActivityBuffer res{};
    res.data_ = std::move(buffer_ptr);
    res.size_ = size;
    return res;
  }

  // accessors
  char* GetData() { return data_.get(); }
  const char* GetData() const { return data_.get(); }
  size_t GetSize() const { return size_; }

 private:
  std::unique_ptr<char[]> data_;
  size_t size_;
}; /* end class ProfilerActivityBuffer */
87 
88 template <typename TDerived>
90  public:
92  virtual ~GPUTracerManager() {}
93 
94  uint64_t RegisterClient() {
95  std::lock_guard<std::mutex> lock(manager_instance_mutex_);
96  auto res = next_client_id_++;
97  per_client_events_by_ext_correlation_.insert({res, {}});
98  ++num_active_clients_;
99  return res;
100  }
101 
102  void DeregisterClient(uint64_t client_handle) {
103  std::lock_guard<std::mutex> lock(manager_instance_mutex_);
104  auto it = per_client_events_by_ext_correlation_.find(client_handle);
105  if (it == per_client_events_by_ext_correlation_.end()) {
106  return;
107  }
108  per_client_events_by_ext_correlation_.erase(it);
109  --num_active_clients_;
110  if (num_active_clients_ == 0 && tracing_enabled_) {
111  StopLogging();
112  }
113  }
114 
115  void StartLogging() {
116  std::lock_guard<std::mutex> lock(manager_instance_mutex_);
117  if (tracing_enabled_) {
118  return;
119  }
120 
121  auto this_as_derived = static_cast<TDerived*>(this);
122  tracing_enabled_ = this_as_derived->OnStartLogging();
123  }
124 
125  void Consume(uint64_t client_handle, const TimePoint& start_time, std::map<uint64_t, Events>& events) {
126  auto this_as_derived = static_cast<TDerived*>(this);
127  events.clear();
128  {
129  // Flush any pending activity records before starting
130  // to process the accumulated activity records.
131  std::lock_guard<std::mutex> lock_manager(manager_instance_mutex_);
132  if (!tracing_enabled_) {
133  return;
134  }
135 
136  this_as_derived->FlushActivities();
137  }
138 
139  std::vector<ProfilerActivityBuffer> activity_buffers;
140  {
141  std::lock_guard<std::mutex> lock(unprocessed_activity_buffers_mutex_);
142  std::swap(unprocessed_activity_buffers_, activity_buffers);
143  unprocessed_activity_buffers_.clear();
144  }
145 
146  {
147  // Ensure that at most one thread is working through the activity buffers at any time.
148  std::lock_guard<std::mutex> lock_two(activity_buffer_processor_mutex_);
149  this_as_derived->ProcessActivityBuffers(activity_buffers, start_time);
150  auto it = per_client_events_by_ext_correlation_.find(client_handle);
151  if (it == per_client_events_by_ext_correlation_.end()) {
152  return;
153  }
154  std::swap(events, it->second);
155  }
156  }
157 
158  void PushCorrelation(uint64_t client_handle,
159  uint64_t external_correlation_id,
160  TimePoint profiling_start_time) {
161  auto this_as_derived = static_cast<TDerived*>(this);
162  std::lock_guard<std::mutex> lock(manager_instance_mutex_);
163  if (!tracing_enabled_) {
164  return;
165  }
166 
167  auto it = per_client_events_by_ext_correlation_.find(client_handle);
168  if (it == per_client_events_by_ext_correlation_.end()) {
169  // not a registered client, do nothing
170  return;
171  }
172 
173  // external_correlation_id is simply the timestamp of this event,
174  // relative to profiling_start_time. i.e., it was computed as:
175  // external_correlation_id =
176  // std::chrono::duration_cast<std::chrono::microseconds>(event_start_time - profiling_start_time).count()
177  //
178  // Because of the relative nature of the external_correlation_id, the same
179  // external_correlation_id can be reused across different clients, which then makes it
180  // impossible to recover the client from the external_correlation_id, which in turn
181  // makes it impossible to map events (which are tagged with external_correlation_id) to clients.
182  //
183  // To address these difficulties, we construct a new correlation_id (let's call it unique_cid)
184  // as follows:
185  // unique_cid =
186  // external_correlation_id +
187  // std::chrono::duration_cast<std::chrono::microseconds>(profiling_start_time.time_since_epoch()).count()
188  // now, unique_cid is monotonically increasing with time, so it can be used to reliably map events to clients.
189  //
190  // Of course, clients expect lists of events to be returned (on a call to Consume()), that are
191  // still keyed on the external_correlation_id that they've specified here, so we need to remember the
192  // offset to be subtracted
193  uint64_t offset = std::chrono::duration_cast<std::chrono::microseconds>(profiling_start_time.time_since_epoch()).count();
194  auto unique_cid = external_correlation_id + offset;
195  unique_correlation_id_to_client_offset_[unique_cid] = std::make_pair(client_handle, offset);
196  this_as_derived->PushUniqueCorrelation(unique_cid);
197  }
198 
199  void PopCorrelation(uint64_t& popped_external_correlation_id) {
200  auto this_as_derived = static_cast<TDerived*>(this);
201  std::lock_guard<std::mutex> lock(manager_instance_mutex_);
202  if (!tracing_enabled_) {
203  return;
204  }
205  uint64_t unique_cid;
206  this_as_derived->PopUniqueCorrelation(unique_cid);
207  // lookup the offset and subtract it before returning popped_external_correlation_id to the client
208  auto client_it = unique_correlation_id_to_client_offset_.find(unique_cid);
209  if (client_it == unique_correlation_id_to_client_offset_.end()) {
210  popped_external_correlation_id = 0;
211  return;
212  }
213  popped_external_correlation_id = unique_cid - client_it->second.second;
214  }
215 
216  void PopCorrelation() {
217  uint64_t unused;
218  PopCorrelation(unused);
219  }
220 
221  protected:
223  auto this_as_derived = static_cast<TDerived*>(this);
224  uint64_t gpu_ts1, gpu_ts2, cpu_ts;
225 
226  // Get the CPU and GPU timestamps to warm up
227  gpu_ts1 = this_as_derived->GetGPUTimestampInNanoseconds();
228  cpu_ts = this->GetCPUTimestampInNanoseconds();
229 
230  // Estimate the skew/offset between the CPU and GPU timestamps.
231  gpu_ts1 = this_as_derived->GetGPUTimestampInNanoseconds();
232  cpu_ts = this->GetCPUTimestampInNanoseconds();
233  gpu_ts2 = this_as_derived->GetGPUTimestampInNanoseconds();
234 
235  auto gpu_ts = (gpu_ts1 + gpu_ts2) / 2;
236  offset_to_add_to_gpu_timestamps_ = cpu_ts - gpu_ts;
237  }
238 
239 #if 0
240  // Functional API to be implemented by subclasses
241  // Included here only for documentation purposes
242 protected:
243  bool OnStartLogging();
244  void OnStopLogging();
245  void ProcessActivityBuffers(const std::vector<ProfilerActivityBuffer>& buffers,
246  const TimePoint& start_time);
247  bool PushUniqueCorrelation(uint64_t unique_cid);
248  void PopUniqueCorrelation(uint64_t& popped_unique_cid);
249  void FlushActivities();
250  uint64_t GetGPUTimestampInNanoseconds();
251 #endif
252 
254  std::lock_guard<std::mutex> lock(unprocessed_activity_buffers_mutex_);
255  unprocessed_activity_buffers_.emplace_back(std::move(buffer));
256  }
257 
258  // To be called by subclasses only from ProcessActivityBuffers
259  void MapEventToClient(uint64_t tracer_correlation_id, EventRecord&& event) {
260  auto it = tracer_correlation_to_unique_correlation_.find(tracer_correlation_id);
261  if (it == tracer_correlation_to_unique_correlation_.end()) {
262  // We're yet to receive a mapping to unique_correlation_id for this tracer_correlation_id
263  DeferEventMapping(std::move(event), tracer_correlation_id);
264  return;
265  }
266  auto unique_correlation_id = it->second;
267  auto p_event_list = GetEventListForUniqueCorrelationId(unique_correlation_id);
268  if (p_event_list != nullptr) {
269  p_event_list->emplace_back(std::move(event));
270  }
271  }
272 
273  // To be called by subclasses only from ProcessActivityBuffers
274  void NotifyNewCorrelation(uint64_t tracer_correlation_id, uint64_t unique_correlation_id) {
275  tracer_correlation_to_unique_correlation_[tracer_correlation_id] = unique_correlation_id;
276  auto pending_it = events_pending_client_mapping_.find(tracer_correlation_id);
277  if (pending_it == events_pending_client_mapping_.end()) {
278  return;
279  }
280  // Map the pending events to the right client
281  MapEventsToClient(unique_correlation_id, std::move(pending_it->second));
282  events_pending_client_mapping_.erase(pending_it);
283  }
284 
285  uint64_t NormalizeGPUTimestampToCPUEpoch(uint64_t gpu_timestamp_in_nanoseconds) {
286  return gpu_timestamp_in_nanoseconds + this->offset_to_add_to_gpu_timestamps_;
287  }
288 
289  private:
290  // Requires: manager_instance_mutex_ should be held
291  void StopLogging() {
292  auto this_as_derived = static_cast<TDerived*>(this);
293  if (!tracing_enabled_) {
294  return;
295  }
296  this_as_derived->OnStopLogging();
297  tracing_enabled_ = false;
298  Clear();
299  }
300 
301  // Requires: manager_instance_mutex_ should be held
302  void Clear() {
303  unprocessed_activity_buffers_.clear();
304  unique_correlation_id_to_client_offset_.clear();
305  per_client_events_by_ext_correlation_.clear();
306  tracer_correlation_to_unique_correlation_.clear();
307  events_pending_client_mapping_.clear();
308  }
309 
310  Events* GetEventListForUniqueCorrelationId(uint64_t unique_correlation_id) {
311  auto client_it = unique_correlation_id_to_client_offset_.find(unique_correlation_id);
312  if (client_it == unique_correlation_id_to_client_offset_.end()) {
313  return nullptr;
314  }
315 
316  // See the comments on the GetUniqueCorrelationId method for an explanation of
317  // of this offset computation and why it's required.
318  auto const& client_handle_offset = client_it->second;
319  auto external_correlation = unique_correlation_id - client_handle_offset.second;
320  auto& event_list = per_client_events_by_ext_correlation_[client_handle_offset.first][external_correlation];
321  return &event_list;
322  }
323 
324  void MapEventsToClient(uint64_t unique_correlation_id, std::vector<EventRecord>&& events) {
325  auto p_event_list = GetEventListForUniqueCorrelationId(unique_correlation_id);
326  if (p_event_list != nullptr) {
327  p_event_list->insert(p_event_list->end(),
328  std::make_move_iterator(events.begin()),
329  std::make_move_iterator(events.end()));
330  }
331  }
332 
333  void DeferEventMapping(EventRecord&& event, uint64_t tracer_correlation_id) {
334  events_pending_client_mapping_[tracer_correlation_id].emplace_back(std::move(event));
335  }
336 
// Current std::chrono::high_resolution_clock reading, expressed as
// nanoseconds since that clock's epoch.
uint64_t GetCPUTimestampInNanoseconds() {
  const auto since_epoch = std::chrono::high_resolution_clock::now().time_since_epoch();
  const auto as_nanoseconds = std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_nanoseconds.count();
}
342 
343  std::mutex manager_instance_mutex_;
344  uint64_t next_client_id_ = 1;
345  uint64_t num_active_clients_ = 0;
346  bool tracing_enabled_ = false;
347  std::mutex unprocessed_activity_buffers_mutex_;
348  std::mutex activity_buffer_processor_mutex_;
349 
350  // Unprocessed activity buffers
351  std::vector<ProfilerActivityBuffer> unprocessed_activity_buffers_;
352 
353  // Keyed on unique_correlation_id -> (client_id/client_handle, offset)
354  // unique_correlation_id - offset == external_correlation_id
355  InlinedHashMap<uint64_t, std::pair<uint64_t, uint64_t>> unique_correlation_id_to_client_offset_;
356 
357  // Keyed on tracer_correlation_id -> unique_correlation_id
358  InlinedHashMap<uint64_t, uint64_t> tracer_correlation_to_unique_correlation_;
359 
360  // client_id/client_handle -> external_correlation_id -> events
361  InlinedHashMap<uint64_t, std::map<uint64_t, Events>> per_client_events_by_ext_correlation_;
362 
363  // Keyed on tracer correlation_id, keeps track of activity records
364  // for which we haven't established the external_correlation_id yet.
365  InlinedHashMap<uint64_t, std::vector<EventRecord>> events_pending_client_mapping_;
366 
// An offset to add to (the possibly skewed) GPU timestamps
// to normalize GPU timestamps with CPU timestamps.
// Zero-initialized so that reading it before an offset has been
// estimated is well-defined (timestamps are then left unshifted).
int64_t offset_to_add_to_gpu_timestamps_ = 0;
370 }; /* class GPUTracerManager */
371 
372 // Base class for a GPU profiler
373 template <typename TManager>
374 class GPUProfilerBase : public EpProfiler {
375  protected:
376  GPUProfilerBase() = default;
377  virtual ~GPUProfilerBase() {}
378 
379  void MergeEvents(std::map<uint64_t, Events>& events_to_merge, Events& events) {
380  Events merged_events;
381 
382  auto event_iter = std::make_move_iterator(events.begin());
383  auto event_end = std::make_move_iterator(events.end());
384  for (auto& map_iter : events_to_merge) {
385  if (map_iter.second.empty()) {
386  continue;
387  }
388 
389  auto ts = static_cast<long long>(map_iter.first);
390 
391  // find the last occurrence of a matching timestamp,
392  // if one exists
393  while (event_iter != event_end &&
394  (event_iter->ts < ts ||
395  (event_iter->ts == ts &&
396  (event_iter + 1) != event_end &&
397  (event_iter + 1)->ts == ts))) {
398  merged_events.emplace_back(*event_iter);
399  ++event_iter;
400  }
401 
402  bool copy_op_names = false;
403  std::string op_name;
404  std::string parent_name;
405 
406  if (event_iter != event_end && event_iter->ts == ts) {
407  // We've located a parent event, copy the op_name and set
408  // this event's parent_name property to the name of the parent.
409  copy_op_names = true;
410  op_name = event_iter->args["op_name"];
411  parent_name = event_iter->name;
412  merged_events.emplace_back(*event_iter);
413  ++event_iter;
414  }
415 
416  for (auto& evt : map_iter.second) {
417  if (copy_op_names) {
418  // If we have found a matching parent event,
419  // then inherit some names from the parent.
420  evt.args["op_name"] = op_name;
421  evt.args["parent_name"] = parent_name;
422  }
423  }
424 
425  merged_events.insert(merged_events.end(),
426  std::make_move_iterator(map_iter.second.begin()),
427  std::make_move_iterator(map_iter.second.end()));
428  }
429 
430  // move any remaining events
431  merged_events.insert(merged_events.end(), event_iter, event_end);
432  std::swap(events, merged_events);
433  }
434 
435  uint64_t client_handle_;
437 
438  public:
439  virtual bool StartProfiling(TimePoint profiling_start_time) override {
440  auto& manager = TManager::GetInstance();
441  manager.StartLogging();
442  profiling_start_time_ = profiling_start_time;
443  return true;
444  }
445 
446  virtual void EndProfiling(TimePoint start_time, Events& events) override {
447  auto& manager = TManager::GetInstance();
448  std::map<uint64_t, Events> event_map;
449  manager.Consume(client_handle_, start_time, event_map);
450  MergeEvents(event_map, events);
451  }
452 
453  virtual void Start(uint64_t id) override {
454  auto& manager = TManager::GetInstance();
455  manager.PushCorrelation(client_handle_, id, profiling_start_time_);
456  }
457 
458  virtual void Stop(uint64_t) override {
459  auto& manager = TManager::GetInstance();
460  manager.PopCorrelation();
461  }
462 }; /* class GPUProfilerBase */
463 
// Formats a pointer value as text using a stream in hexadecimal mode.
// The exact textual form is whatever the standard library's stream
// insertion for `const void*` produces.
static inline std::string PointerToHexString(const void* ptr) {
  std::ostringstream formatter;
  formatter.setf(std::ios_base::hex, std::ios_base::basefield);
  formatter << ptr;
  return formatter.str();
}
470 
471 } /* end namespace profiling */
472 } /* end namespace onnxruntime */
static ProfilerActivityBuffer CreateFromPreallocatedBuffer(std::unique_ptr< char[]> &&buffer_ptr, size_t size)
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GPUTracerManager)
ProfilerActivityBuffer(ProfilerActivityBuffer &&other) noexcept
GLboolean * data
Definition: glcorearb.h:131
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
Definition: UT_ArraySet.h:1639
GLsizei const GLchar *const * string
Definition: glcorearb.h:814
void MergeEvents(std::map< uint64_t, Events > &events_to_merge, Events &events)
void EnqueueActivityBuffer(ProfilerActivityBuffer &&buffer)
virtual void Start(uint64_t id) override
struct _cl_event * event
Definition: glcorearb.h:2961
void DeregisterClient(uint64_t client_handle)
GLintptr offset
Definition: glcorearb.h:665
Definition: core.h:760
void Consume(uint64_t client_handle, const TimePoint &start_time, std::map< uint64_t, Events > &events)
virtual void EndProfiling(TimePoint start_time, Events &events) override
std::chrono::high_resolution_clock::time_point TimePoint
Definition: common.h:42
const GLuint * buffers
Definition: glcorearb.h:661
void MapEventToClient(uint64_t tracer_correlation_id, EventRecord &&event)
std::vector< EventRecord > Events
ProfilerActivityBuffer(const char *data, size_t size) noexcept
void PushCorrelation(uint64_t client_handle, uint64_t external_correlation_id, TimePoint profiling_start_time)
ProfilerActivityBuffer & operator=(ProfilerActivityBuffer &&other) noexcept
GLsizeiptr size
Definition: glcorearb.h:664
void PopCorrelation(uint64_t &popped_external_correlation_id)
virtual void Stop(uint64_t) override
auto ptr(T p) -> const void *
Definition: format.h:2448
ProfilerActivityBuffer(const ProfilerActivityBuffer &other) noexcept
ProfilerActivityBuffer & operator=(const ProfilerActivityBuffer &other) noexcept
void NotifyNewCorrelation(uint64_t tracer_correlation_id, uint64_t unique_correlation_id)
virtual bool StartProfiling(TimePoint profiling_start_time) override
uint64_t NormalizeGPUTimestampToCPUEpoch(uint64_t gpu_timestamp_in_nanoseconds)
GLint GLsizei count
Definition: glcorearb.h:405
Definition: format.h:895