CudaDeviceBuffer.h
// Copyright Contributors to the OpenVDB Project
// SPDX-License-Identifier: MPL-2.0

/*!
    \file CudaDeviceBuffer.h

    \author Ken Museth

    \date January 8, 2020

    \brief Implements a simple dual (host/device) CUDA buffer.

    \note This file has no device-only (kernel) function calls,
          which explains why it's a .h and not a .cuh file.
*/

#ifndef NANOVDB_CUDA_DEVICE_BUFFER_H_HAS_BEEN_INCLUDED
#define NANOVDB_CUDA_DEVICE_BUFFER_H_HAS_BEEN_INCLUDED

#include "../HostBuffer.h" // for BufferTraits
#include "CudaUtils.h" // for cudaMalloc/cudaMallocManaged/cudaFree

namespace nanovdb {

// ----------------------------> CudaDeviceBuffer <--------------------------------------

/// @brief Simple memory buffer using un-managed pinned host memory when compiled with NVCC.
///        Obviously this class is making explicit use of CUDA, so replace it with your own memory
///        allocator if you are not using CUDA.
/// @note While CUDA's pinned host memory allows for asynchronous memory copies between host and device,
///       it is significantly slower than cached (un-pinned) memory on the host.
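/// @par Example
/// A minimal usage sketch (assumes a CUDA-capable device; std::memset requires <cstring>):
/// @code
/// nanovdb::CudaDeviceBuffer buffer = nanovdb::CudaDeviceBuffer::create(1 << 20); // 1 MB, host-side only
/// std::memset(buffer.data(), 0, buffer.size()); // fill the pinned host buffer on the CPU
/// buffer.deviceUpload();                        // allocates the device buffer and copies CPU -> GPU
/// uint8_t* d_ptr = buffer.deviceData();         // raw device pointer, e.g. for use in CUDA kernels
/// buffer.clear();                               // frees both the host and the device allocation
/// @endcode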
class CudaDeviceBuffer
{

    uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device)
    uint8_t *mCpuData, *mGpuData; // raw pointers to the host and device buffers

public:
    /// @brief Static factory method that returns an instance of this buffer
    /// @param size byte size of buffer to be initialized
    /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer
    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
    /// @param stream optional stream argument (defaults to stream NULL)
    /// @return An instance of this class using move semantics
    static CudaDeviceBuffer create(uint64_t size, const CudaDeviceBuffer* dummy = nullptr, bool host = true, void* stream = nullptr);

    /// @brief Constructor
    /// @param size byte size of buffer to be initialized
    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
    /// @param stream optional stream argument (defaults to stream NULL)
    CudaDeviceBuffer(uint64_t size = 0, bool host = true, void* stream = nullptr)
        : mSize(0)
        , mCpuData(nullptr)
        , mGpuData(nullptr)
    {
        if (size > 0) this->init(size, host, stream);
    }

    /// @brief Disallow copy-construction
    CudaDeviceBuffer(const CudaDeviceBuffer&) = delete;

    /// @brief Move copy-constructor
    CudaDeviceBuffer(CudaDeviceBuffer&& other) noexcept
        : mSize(other.mSize)
        , mCpuData(other.mCpuData)
        , mGpuData(other.mGpuData)
    {
        other.mSize = 0;
        other.mCpuData = nullptr;
        other.mGpuData = nullptr;
    }

    /// @brief Disallow copy assignment operation
    CudaDeviceBuffer& operator=(const CudaDeviceBuffer&) = delete;

    /// @brief Move copy assignment operation
    CudaDeviceBuffer& operator=(CudaDeviceBuffer&& other) noexcept
    {
        this->clear();
        mSize = other.mSize;
        mCpuData = other.mCpuData;
        mGpuData = other.mGpuData;
        other.mSize = 0;
        other.mCpuData = nullptr;
        other.mGpuData = nullptr;
        return *this;
    }

    /// @brief Destructor frees memory on both the host and device
    ~CudaDeviceBuffer() { this->clear(); };

    /// @brief Initialize buffer
    /// @param size byte size of buffer to be initialized
    /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU
    /// @param stream optional stream argument (defaults to stream NULL)
    /// @note All existing buffers are first cleared
    /// @warning size is expected to be non-zero. Use clear() to clear the buffer!
    void init(uint64_t size, bool host = true, void* stream = nullptr);

    /// @brief Returns a raw pointer to the host/CPU buffer managed by this allocator.
    /// @warning Note that the pointer can be NULL!
    uint8_t* data() const { return mCpuData; }

    /// @brief Returns a raw pointer to the device/GPU buffer managed by this allocator.
    /// @warning Note that the pointer can be NULL!
    uint8_t* deviceData() const { return mGpuData; }

    /// @brief Upload this buffer from the host to the device, i.e. CPU -> GPU.
    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
    /// @param sync if false the memory copy is asynchronous
    /// @note If the device/GPU buffer does not exist it is first allocated
    /// @note See the usage sketch following this class for asynchronous copies on an explicit stream
    /// @warning Assumes that the host/CPU buffer already exists
    void deviceUpload(void* stream = nullptr, bool sync = true) const;

    /// @brief Download this buffer from the device to the host, i.e. GPU -> CPU.
    /// @param stream optional CUDA stream (defaults to CUDA stream 0)
    /// @param sync if false the memory copy is asynchronous
    /// @note If the host/CPU buffer does not exist it is first allocated
    /// @warning Assumes that the device/GPU buffer already exists
    void deviceDownload(void* stream = nullptr, bool sync = true) const;

    /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator.
    uint64_t size() const { return mSize; }

    //@{
    /// @brief Returns true if this allocator is empty, i.e. has no allocated memory
    bool empty() const { return mSize == 0; }
    bool isEmpty() const { return mSize == 0; }
    //@}

    /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL
    void clear(void* stream = nullptr);

}; // CudaDeviceBuffer class
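
// Example (sketch): asynchronous upload/download on an explicit CUDA stream. This only
// illustrates the methods declared above; `myKernel`, `blocks` and `threads` are hypothetical
// placeholders and a CUDA-capable device is assumed.
//
//   cudaStream_t stream;
//   cudaCheck(cudaStreamCreate(&stream));
//   nanovdb::CudaDeviceBuffer buffer(1 << 20, /*host=*/true, stream); // pinned host allocation
//   /* ... fill buffer.data() on the host ... */
//   buffer.deviceUpload(stream, /*sync=*/false);   // enqueue CPU -> GPU copy on the stream
//   myKernel<<<blocks, threads, 0, stream>>>(buffer.deviceData()); // consume the device data
//   buffer.deviceDownload(stream, /*sync=*/false); // enqueue GPU -> CPU copy on the same stream
//   cudaCheck(cudaStreamSynchronize(stream));      // wait for the copies and the kernel
//   cudaCheck(cudaStreamDestroy(stream));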

template<>
struct BufferTraits<CudaDeviceBuffer>
{
    static constexpr bool hasDeviceDual = true;
};

// --------------------------> Implementations below <------------------------------------

inline CudaDeviceBuffer CudaDeviceBuffer::create(uint64_t size, const CudaDeviceBuffer*, bool host, void* stream)
{
    return CudaDeviceBuffer(size, host, stream);
}

inline void CudaDeviceBuffer::init(uint64_t size, bool host, void* stream)
{
    if (mSize > 0) this->clear(stream);
    NANOVDB_ASSERT(size > 0);
    if (host) {
        cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
        checkPtr(mCpuData, "CudaDeviceBuffer::init: failed to allocate host buffer");
    } else {
        cudaCheck(cudaMallocAsync((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
        checkPtr(mGpuData, "CudaDeviceBuffer::init: failed to allocate device buffer");
    }
    mSize = size;
} // CudaDeviceBuffer::init

inline void CudaDeviceBuffer::deviceUpload(void* stream, bool sync) const
{
    checkPtr(mCpuData, "uninitialized cpu data");
    if (mGpuData == nullptr) {
        cudaCheck(cudaMallocAsync((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
    }
    checkPtr(mGpuData, "uninitialized gpu data");
    cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream)));
    if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
} // CudaDeviceBuffer::deviceUpload

inline void CudaDeviceBuffer::deviceDownload(void* stream, bool sync) const
{
    checkPtr(mGpuData, "uninitialized gpu data");
    if (mCpuData == nullptr) {
        cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
    }
    checkPtr(mCpuData, "uninitialized cpu data");
    cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream)));
    if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
} // CudaDeviceBuffer::deviceDownload

inline void CudaDeviceBuffer::clear(void* stream)
{
    if (mGpuData) cudaCheck(cudaFreeAsync(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
    if (mCpuData) cudaCheck(cudaFreeHost(mCpuData));
    mCpuData = mGpuData = nullptr;
    mSize = 0;
} // CudaDeviceBuffer::clear

} // namespace nanovdb

#endif // end of NANOVDB_CUDA_DEVICE_BUFFER_H_HAS_BEEN_INCLUDED
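
The BufferTraits<CudaDeviceBuffer> specialization above (hasDeviceDual = true) is what allows NanoVDB's GridHandle to treat this buffer as a paired host/device allocation. The following is a hedged sketch of that typical use; the header paths, the createLevelSetSphere helper and its template parameters are assumptions that vary between NanoVDB releases, and the device grid would normally be consumed by a CUDA kernel.

#include "CudaDeviceBuffer.h"        // this header
#include <nanovdb/util/GridHandle.h> // assumed location of nanovdb::GridHandle
#include <nanovdb/util/Primitives.h> // assumed location of nanovdb::createLevelSetSphere

void buildAndUploadSphere()
{
    // Build a small level-set sphere whose memory is backed by a CudaDeviceBuffer
    // (template parameters are illustrative; some releases order them differently).
    auto handle = nanovdb::createLevelSetSphere<float, nanovdb::CudaDeviceBuffer>(100.0f);

    // Because BufferTraits<CudaDeviceBuffer>::hasDeviceDual is true, the handle can
    // mirror the grid on the GPU with a single call (synchronous by default).
    handle.deviceUpload();

    const nanovdb::NanoGrid<float>* d_grid = handle.deviceGrid<float>(); // device-side grid
    const nanovdb::NanoGrid<float>* h_grid = handle.grid<float>();       // host-side grid
    (void)d_grid; (void)h_grid; // in real code, pass d_grid to a CUDA kernel
}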