14 namespace onnxruntime {
37 : data_(
nullptr), size_(0) {}
40 : data_(std::make_unique<
char[]>(
size)), size_(
size) {
41 memcpy(data_.get(),
data, size_);
73 res.data_ = std::move(buffer_ptr);
// Returns the raw pointer currently held by data_ (a std::unique_ptr<char[]>).
// Ownership is not transferred: data_ keeps owning the storage, so the
// returned pointer is only usable while this object still holds that buffer.
// The result is nullptr whenever data_ holds nullptr (e.g. after the
// `data_(nullptr), size_(0)` initializer).
80 const char*
GetData()
const {
return data_.get(); }
84 std::unique_ptr<char[]> data_;
88 template <
typename TDerived>
95 std::lock_guard<std::mutex> lock(manager_instance_mutex_);
96 auto res = next_client_id_++;
97 per_client_events_by_ext_correlation_.insert({res, {}});
98 ++num_active_clients_;
103 std::lock_guard<std::mutex> lock(manager_instance_mutex_);
104 auto it = per_client_events_by_ext_correlation_.find(client_handle);
105 if (it == per_client_events_by_ext_correlation_.end()) {
108 per_client_events_by_ext_correlation_.erase(it);
109 --num_active_clients_;
110 if (num_active_clients_ == 0 && tracing_enabled_) {
116 std::lock_guard<std::mutex> lock(manager_instance_mutex_);
117 if (tracing_enabled_) {
121 auto this_as_derived =
static_cast<TDerived*
>(
this);
122 tracing_enabled_ = this_as_derived->OnStartLogging();
125 void Consume(uint64_t client_handle,
const TimePoint& start_time, std::map<uint64_t, Events>& events) {
126 auto this_as_derived =
static_cast<TDerived*
>(
this);
131 std::lock_guard<std::mutex> lock_manager(manager_instance_mutex_);
132 if (!tracing_enabled_) {
136 this_as_derived->FlushActivities();
139 std::vector<ProfilerActivityBuffer> activity_buffers;
141 std::lock_guard<std::mutex> lock(unprocessed_activity_buffers_mutex_);
142 std::swap(unprocessed_activity_buffers_, activity_buffers);
143 unprocessed_activity_buffers_.clear();
148 std::lock_guard<std::mutex> lock_two(activity_buffer_processor_mutex_);
149 this_as_derived->ProcessActivityBuffers(activity_buffers, start_time);
150 auto it = per_client_events_by_ext_correlation_.find(client_handle);
151 if (it == per_client_events_by_ext_correlation_.end()) {
159 uint64_t external_correlation_id,
161 auto this_as_derived =
static_cast<TDerived*
>(
this);
162 std::lock_guard<std::mutex> lock(manager_instance_mutex_);
163 if (!tracing_enabled_) {
167 auto it = per_client_events_by_ext_correlation_.find(client_handle);
168 if (it == per_client_events_by_ext_correlation_.end()) {
193 uint64_t
offset = std::chrono::duration_cast<std::chrono::microseconds>(profiling_start_time.time_since_epoch()).
count();
194 auto unique_cid = external_correlation_id +
offset;
195 unique_correlation_id_to_client_offset_[unique_cid] = std::make_pair(client_handle, offset);
196 this_as_derived->PushUniqueCorrelation(unique_cid);
200 auto this_as_derived =
static_cast<TDerived*
>(
this);
201 std::lock_guard<std::mutex> lock(manager_instance_mutex_);
202 if (!tracing_enabled_) {
206 this_as_derived->PopUniqueCorrelation(unique_cid);
208 auto client_it = unique_correlation_id_to_client_offset_.find(unique_cid);
209 if (client_it == unique_correlation_id_to_client_offset_.end()) {
210 popped_external_correlation_id = 0;
213 popped_external_correlation_id = unique_cid - client_it->second.second;
223 auto this_as_derived =
static_cast<TDerived*
>(
this);
224 uint64_t gpu_ts1, gpu_ts2, cpu_ts;
227 gpu_ts1 = this_as_derived->GetGPUTimestampInNanoseconds();
228 cpu_ts = this->GetCPUTimestampInNanoseconds();
231 gpu_ts1 = this_as_derived->GetGPUTimestampInNanoseconds();
232 cpu_ts = this->GetCPUTimestampInNanoseconds();
233 gpu_ts2 = this_as_derived->GetGPUTimestampInNanoseconds();
235 auto gpu_ts = (gpu_ts1 + gpu_ts2) / 2;
236 offset_to_add_to_gpu_timestamps_ = cpu_ts - gpu_ts;
// Hook invoked on the TDerived instance (through static_cast<TDerived*>(this))
// when logging is started; its bool result is stored into tracing_enabled_.
243 bool OnStartLogging();
// Hook invoked on the TDerived instance right before tracing_enabled_ is
// reset to false.
244 void OnStopLogging();
245 void ProcessActivityBuffers(
const std::vector<ProfilerActivityBuffer>&
buffers,
// Invoked on the TDerived instance with the unique correlation id
// (external_correlation_id + offset) after that id has been recorded in
// unique_correlation_id_to_client_offset_.
// NOTE(review): the bool result is ignored at the visible call site — confirm
// whether a failure should be surfaced.
247 bool PushUniqueCorrelation(uint64_t unique_cid);
// Invoked on the TDerived instance when a correlation is popped. The argument
// is passed by non-const reference and the caller looks the value up in
// unique_correlation_id_to_client_offset_ afterwards, so the implementation is
// presumably expected to write the popped unique id into it — TODO confirm.
248 void PopUniqueCorrelation(uint64_t& popped_unique_cid);
// Invoked on the TDerived instance in Consume() before the pending activity
// buffers are swapped out for processing.
249 void FlushActivities();
// Invoked on the TDerived instance to sample the GPU clock; two samples taken
// around a CPU timestamp are averaged to compute
// offset_to_add_to_gpu_timestamps_.
250 uint64_t GetGPUTimestampInNanoseconds();
254 std::lock_guard<std::mutex> lock(unprocessed_activity_buffers_mutex_);
255 unprocessed_activity_buffers_.emplace_back(std::move(
buffer));
260 auto it = tracer_correlation_to_unique_correlation_.find(tracer_correlation_id);
261 if (it == tracer_correlation_to_unique_correlation_.end()) {
263 DeferEventMapping(std::move(
event), tracer_correlation_id);
266 auto unique_correlation_id = it->second;
267 auto p_event_list = GetEventListForUniqueCorrelationId(unique_correlation_id);
268 if (p_event_list !=
nullptr) {
269 p_event_list->emplace_back(std::move(
event));
275 tracer_correlation_to_unique_correlation_[tracer_correlation_id] = unique_correlation_id;
276 auto pending_it = events_pending_client_mapping_.find(tracer_correlation_id);
277 if (pending_it == events_pending_client_mapping_.end()) {
281 MapEventsToClient(unique_correlation_id, std::move(pending_it->second));
282 events_pending_client_mapping_.erase(pending_it);
286 return gpu_timestamp_in_nanoseconds + this->offset_to_add_to_gpu_timestamps_;
292 auto this_as_derived =
static_cast<TDerived*
>(
this);
293 if (!tracing_enabled_) {
296 this_as_derived->OnStopLogging();
297 tracing_enabled_ =
false;
303 unprocessed_activity_buffers_.clear();
304 unique_correlation_id_to_client_offset_.clear();
305 per_client_events_by_ext_correlation_.clear();
306 tracer_correlation_to_unique_correlation_.clear();
307 events_pending_client_mapping_.clear();
310 Events* GetEventListForUniqueCorrelationId(uint64_t unique_correlation_id) {
311 auto client_it = unique_correlation_id_to_client_offset_.find(unique_correlation_id);
312 if (client_it == unique_correlation_id_to_client_offset_.end()) {
318 auto const& client_handle_offset = client_it->second;
319 auto external_correlation = unique_correlation_id - client_handle_offset.second;
320 auto& event_list = per_client_events_by_ext_correlation_[client_handle_offset.first][external_correlation];
324 void MapEventsToClient(uint64_t unique_correlation_id, std::vector<EventRecord>&& events) {
325 auto p_event_list = GetEventListForUniqueCorrelationId(unique_correlation_id);
326 if (p_event_list !=
nullptr) {
327 p_event_list->insert(p_event_list->end(),
328 std::make_move_iterator(events.begin()),
329 std::make_move_iterator(events.end()));
333 void DeferEventMapping(EventRecord&&
event, uint64_t tracer_correlation_id) {
334 events_pending_client_mapping_[tracer_correlation_id].emplace_back(std::move(
event));
337 uint64_t GetCPUTimestampInNanoseconds() {
338 return std::chrono::duration_cast<std::chrono::nanoseconds>(
339 std::chrono::high_resolution_clock::now().time_since_epoch())
// Locked around client registration/deregistration, reads and writes of
// tracing_enabled_, and correlation push/pop.
343 std::mutex manager_instance_mutex_;
// Handle returned to the next registering client; post-incremented on each
// registration, so handles start at 1.
344 uint64_t next_client_id_ = 1;
// Incremented when a client registers, decremented when it deregisters.
345 uint64_t num_active_clients_ = 0;
// Set from the result of TDerived::OnStartLogging(); reset to false after
// TDerived::OnStopLogging() has been called.
346 bool tracing_enabled_ =
false;
// Locked while buffers are appended to, or swapped out of,
// unprocessed_activity_buffers_.
347 std::mutex unprocessed_activity_buffers_mutex_;
// Locked in Consume() before TDerived::ProcessActivityBuffers() is called.
348 std::mutex activity_buffer_processor_mutex_;
// Activity buffers enqueued but not yet handed to ProcessActivityBuffers().
351 std::vector<ProfilerActivityBuffer> unprocessed_activity_buffers_;
// unique correlation id -> (client handle, offset), where
// unique id = external correlation id + offset and offset is the profiling
// start time in microseconds since the clock's epoch.
355 InlinedHashMap<uint64_t, std::pair<uint64_t, uint64_t>> unique_correlation_id_to_client_offset_;
// tracer correlation id -> unique correlation id.
358 InlinedHashMap<uint64_t, uint64_t> tracer_correlation_to_unique_correlation_;
// client handle -> (external correlation id -> events recorded for it).
361 InlinedHashMap<uint64_t, std::map<uint64_t, Events>> per_client_events_by_ext_correlation_;
// tracer correlation id -> events deferred because no unique correlation id
// was known for them yet; drained once that mapping is registered.
365 InlinedHashMap<uint64_t, std::vector<EventRecord>> events_pending_client_mapping_;
// cpu_ts - gpu_ts estimate; added to a GPU timestamp to express it on the CPU
// clock.
// NOTE(review): no in-class initializer here — confirm the assignment always
// runs before the first read.
369 int64_t offset_to_add_to_gpu_timestamps_;
373 template <
typename TManager>
382 auto event_iter = std::make_move_iterator(events.begin());
383 auto event_end = std::make_move_iterator(events.end());
384 for (
auto& map_iter : events_to_merge) {
385 if (map_iter.second.empty()) {
389 auto ts =
static_cast<long long>(map_iter.first);
393 while (event_iter != event_end &&
394 (event_iter->ts < ts ||
395 (event_iter->ts == ts &&
396 (event_iter + 1) != event_end &&
397 (event_iter + 1)->ts == ts))) {
398 merged_events.emplace_back(*event_iter);
402 bool copy_op_names =
false;
406 if (event_iter != event_end && event_iter->ts == ts) {
409 copy_op_names =
true;
410 op_name = event_iter->args[
"op_name"];
411 parent_name = event_iter->name;
412 merged_events.emplace_back(*event_iter);
416 for (
auto& evt : map_iter.second) {
420 evt.args[
"op_name"] = op_name;
421 evt.args[
"parent_name"] = parent_name;
425 merged_events.insert(merged_events.end(),
426 std::make_move_iterator(map_iter.second.begin()),
427 std::make_move_iterator(map_iter.second.end()));
431 merged_events.insert(merged_events.end(), event_iter, event_end);
440 auto& manager = TManager::GetInstance();
441 manager.StartLogging();
447 auto& manager = TManager::GetInstance();
448 std::map<uint64_t, Events> event_map;
453 virtual void Start(uint64_t
id)
override {
454 auto& manager = TManager::GetInstance();
458 virtual void Stop(uint64_t)
override {
459 auto& manager = TManager::GetInstance();
460 manager.PopCorrelation();
466 std::ostringstream sstr;
467 sstr << std::hex <<
ptr;
static ProfilerActivityBuffer CreateFromPreallocatedBuffer(std::unique_ptr< char[]> &&buffer_ptr, size_t size)
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GPUTracerManager)
ProfilerActivityBuffer(ProfilerActivityBuffer &&other) noexcept
GPUProfilerBase()=default
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
GLsizei const GLchar *const * string
uint64_t RegisterClient()
void MergeEvents(std::map< uint64_t, Events > &events_to_merge, Events &events)
void EnqueueActivityBuffer(ProfilerActivityBuffer &&buffer)
virtual void Start(uint64_t id) override
void DeregisterClient(uint64_t client_handle)
ProfilerActivityBuffer() noexcept
virtual ~GPUTracerManager()
void Consume(uint64_t client_handle, const TimePoint &start_time, std::map< uint64_t, Events > &events)
virtual void EndProfiling(TimePoint start_time, Events &events) override
std::chrono::high_resolution_clock::time_point TimePoint
void MapEventToClient(uint64_t tracer_correlation_id, EventRecord &&event)
std::vector< EventRecord > Events
ProfilerActivityBuffer(const char *data, size_t size) noexcept
void PushCorrelation(uint64_t client_handle, uint64_t external_correlation_id, TimePoint profiling_start_time)
ProfilerActivityBuffer & operator=(ProfilerActivityBuffer &&other) noexcept
const char * GetData() const
void PopCorrelation(uint64_t &popped_external_correlation_id)
virtual void Stop(uint64_t) override
ProfilerActivityBuffer(const ProfilerActivityBuffer &other) noexcept
ProfilerActivityBuffer & operator=(const ProfilerActivityBuffer &other) noexcept
void NotifyNewCorrelation(uint64_t tracer_correlation_id, uint64_t unique_correlation_id)
virtual ~GPUProfilerBase()
TimePoint profiling_start_time_
virtual bool StartProfiling(TimePoint profiling_start_time) override
uint64_t NormalizeGPUTimestampToCPUEpoch(uint64_t gpu_timestamp_in_nanoseconds)