Open3D (C++ API)  0.19.0
StdGPUHashBackend.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include <stdgpu/memory.h>
#include <stdgpu/utility.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <stdgpu/unordered_map.cuh>
#include <type_traits>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/StdAllocator.h"
#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
#include "open3d/core/hashmap/DeviceHashBackend.h"
#include "open3d/core/hashmap/Dispatch.h"

namespace open3d {
namespace core {

/// Allocator satisfying the standard Allocator requirements. It routes
/// allocations through Open3D's StdAllocator (and thus the MemoryManager)
/// and registers the resulting device memory with stdgpu, so that stdgpu
/// containers can be backed by Open3D-managed memory.
template <typename T>
class StdGPUAllocator {
public:
    /// T.
    using value_type = T;

    /// Default constructor.
    StdGPUAllocator() = default;

    /// Constructor from device.
    explicit StdGPUAllocator(int device_id) : std_allocator_(device_id) {}

    /// Default copy constructor.
    StdGPUAllocator(const StdGPUAllocator&) = default;

    /// Default copy assignment operator.
    StdGPUAllocator& operator=(const StdGPUAllocator&) = default;

    /// Default move constructor.
    StdGPUAllocator(StdGPUAllocator&&) = default;

    /// Default move assignment operator.
    StdGPUAllocator& operator=(StdGPUAllocator&&) = default;

    /// Rebind copy constructor.
    template <typename U>
    StdGPUAllocator(const StdGPUAllocator<U>& other)
        : std_allocator_(other.std_allocator_) {}

    /// Allocates memory of size \p n.
    T* allocate(std::size_t n) {
        T* p = std_allocator_.allocate(n);
        stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
        return p;
    }

    /// Deallocates memory from pointer \p p of size \p n.
    void deallocate(T* p, std::size_t n) {
        stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
        std_allocator_.deallocate(p, n);
    }

    /// Returns true if the instances are equal, false otherwise.
    bool operator==(const StdGPUAllocator& other) {
        return std_allocator_ == other.std_allocator_;
    }

    /// Returns true if the instances are not equal, false otherwise.
    bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }

private:
    // Allow access in rebind constructor.
    template <typename T2>
    friend class StdGPUAllocator;

    StdAllocator<T> std_allocator_;
};
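// The register_memory()/deregister_memory() calls above report
// MemoryManager-backed allocations to stdgpu's internal bookkeeping, which
// otherwise only tracks memory allocated through stdgpu's own API.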

// These typedefs must be defined outside of StdGPUHashBackend to make them
// accessible in raw CUDA kernels.
template <typename Key>
using InternalStdGPUHashBackendAllocator =
        StdGPUAllocator<stdgpu::pair<const Key, buf_index_t>>;

template <typename Key, typename Hash, typename Eq>
using InternalStdGPUHashBackend =
        stdgpu::unordered_map<Key,
                              buf_index_t,
                              Hash,
                              Eq,
                              InternalStdGPUHashBackendAllocator<Key>>;

template <typename Key, typename Hash, typename Eq>
class StdGPUHashBackend : public DeviceHashBackend {
public:
    StdGPUHashBackend(int64_t init_capacity,
                      int64_t key_dsize,
                      const std::vector<int64_t>& value_dsizes,
                      const Device& device);
    ~StdGPUHashBackend();

    void Reserve(int64_t capacity) override;

    /// Parallel insert contiguous arrays of keys and values.
    void Insert(const void* input_keys,
                const std::vector<const void*>& input_values_soa,
                buf_index_t* output_buf_indices,
                bool* output_masks,
                int64_t count) override;

    /// Parallel find a contiguous array of keys.
    void Find(const void* input_keys,
              buf_index_t* output_buf_indices,
              bool* output_masks,
              int64_t count) override;

    /// Parallel erase a contiguous array of keys.
    void Erase(const void* input_keys,
               bool* output_masks,
               int64_t count) override;

    /// Parallel collect all iterators in the hash table.
    int64_t GetActiveIndices(buf_index_t* output_indices) override;

    /// Clear stored map without reallocating memory.
    void Clear() override;

    /// Get the size (number of valid entries) of the hash map.
    int64_t Size() const override;

    /// Get the number of buckets of the hash map.
    int64_t GetBucketCount() const override;
    /// Get the number of entries per bucket.
    std::vector<int64_t> BucketSizes() const override;
    /// Get the current load factor, defined as size / bucket count.
    float LoadFactor() const override;

    InternalStdGPUHashBackend<Key, Hash, Eq> GetImpl() const { return impl_; }

    void Allocate(int64_t capacity);
    void Free();

protected:
    // Use reference, since the structure itself is implicitly handled as a
    // pointer directly by stdgpu.
    InternalStdGPUHashBackend<Key, Hash, Eq> impl_;

    CUDAHashBackendBufferAccessor buffer_accessor_;
};

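// Layout note: impl_ only maps each key to a buf_index_t slot. The key and
// value payloads themselves live in the HashBackendBuffer held by the base
// DeviceHashBackend (this->buffer_) and are read/written through
// buffer_accessor_ inside the kernels below.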
template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::StdGPUHashBackend(
        int64_t init_capacity,
        int64_t key_dsize,
        const std::vector<int64_t>& value_dsizes,
        const Device& device)
    : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
    CUDAScopedDevice scoped_device(this->device_);
    Allocate(init_capacity);
}

template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::~StdGPUHashBackend() {
    CUDAScopedDevice scoped_device(this->device_);
    Free();
}

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::Size() const {
    CUDAScopedDevice scoped_device(this->device_);
    return impl_.size();
}

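// One thread per query key: each thread looks its key up in the stdgpu map,
// writes whether it was found to output_masks, and, on a hit, writes the
// key's buffer index (the value stored in the map) to output_buf_indices.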
// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                 CUDAHashBackendBufferAccessor buffer_accessor,
                                 const Key* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_buf_indices[tid] = flag ? iter->second : 0;
}

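// The host-side wrappers below use a fixed 128-thread block with a
// ceil-divided grid, launch on Open3D's current CUDA stream, and synchronize
// before returning so the outputs are ready for the caller.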
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
                                            buf_index_t* output_buf_indices,
                                            bool* output_masks,
                                            int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                  CUDAHashBackendBufferAccessor buffer_accessor,
                                  const Key* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_buf_indices[tid] = flag ? iter->second : 0;

    if (output_masks[tid]) {
        output_masks[tid] = map.erase(key);
        if (output_masks[tid]) {
            buffer_accessor.DeviceFree(output_buf_indices[tid]);
        }
    }
}

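// The erase wrapper needs per-key buffer indices only as kernel scratch
// space, so it allocates a temporary Int32 tensor for them rather than taking
// an output argument from the caller.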
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
                                             bool* output_masks,
                                             int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    core::Tensor toutput_buf_indices =
            core::Tensor({count}, core::Int32, this->device_);
    buf_index_t* output_buf_indices =
            static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());

    STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}

template <typename Key>
struct ValueExtractor {
    OPEN3D_HOST_DEVICE buf_index_t
    operator()(const stdgpu::pair<Key, buf_index_t>& x) const {
        return x.second;
    }
};

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetActiveIndices(
        buf_index_t* output_indices) {
    CUDAScopedDevice scoped_device(this->device_);
    auto range = impl_.device_range();

    thrust::transform(range.begin(), range.end(), output_indices,
                      ValueExtractor<Key>());

    return impl_.size();
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Clear() {
    CUDAScopedDevice scoped_device(this->device_);
    impl_.clear();
    this->buffer_->ResetHeap();
}

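// Reserve() is a no-op for this backend: the stdgpu map and the value buffer
// are sized once in Allocate() and are not rehashed afterwards.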
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Reserve(int64_t capacity) {
    CUDAScopedDevice scoped_device(this->device_);
}

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetBucketCount() const {
    CUDAScopedDevice scoped_device(this->device_);
    return impl_.bucket_count();
}

template <typename Key, typename Hash, typename Eq>
std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
    CUDAScopedDevice scoped_device(this->device_);
    utility::LogError("Unimplemented");
}

template <typename Key, typename Hash, typename Eq>
float StdGPUHashBackend<Key, Hash, Eq>::LoadFactor() const {
    CUDAScopedDevice scoped_device(this->device_);
    return impl_.load_factor();
}

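// Insertion stores values in structure-of-arrays (SoA) form: input_values_soa
// holds one device pointer per value array, and each element is copied in
// fixed-size blocks of block_t, chosen via DISPATCH_DIVISOR_SIZE_TO_BLOCK_T
// from the buffer's common block size.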
// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void STDGPUInsertKernel(
        InternalStdGPUHashBackend<Key, Hash, Eq> map,
        CUDAHashBackendBufferAccessor buffer_accessor,
        const Key* input_keys,
        const void* const* input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count,
        int64_t n_values) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    output_buf_indices[tid] = 0;
    output_masks[tid] = false;

    // First apply 'try insert' with a dummy index
    auto res = map.emplace(key, 0);

    // If success, change the iterator and provide the actual index
    if (res.second) {
        buf_index_t buf_index = buffer_accessor.DeviceAllocate();
        auto key_ptr = buffer_accessor.GetKeyPtr(buf_index);

        // Copy templated key to buffer (duplicate)
        // TODO: hack stdgpu inside and take out the buffer directly
        *static_cast<Key*>(key_ptr) = key;

        // Copy/reset non-templated value in buffer
        for (int j = 0; j < n_values; ++j) {
            const int64_t blocks_per_element =
                    buffer_accessor.value_blocks_per_element_[j];

            block_t* dst_value = static_cast<block_t*>(
                    buffer_accessor.GetValuePtr(buf_index, j));
            const block_t* src_value =
                    static_cast<const block_t*>(input_values_soa[j]) +
                    blocks_per_element * tid;
            for (int b = 0; b < blocks_per_element; ++b) {
                dst_value[b] = src_value[b];
            }
        }

        // Update from the dummy index
        res.first->second = buf_index;

        // Write to return variables
        output_buf_indices[tid] = buf_index;
        output_masks[tid] = true;
    }
}

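// The host wrapper stages the vector of SoA value pointers in a
// thrust::device_vector so the kernel can dereference it on the device, then
// dispatches on the common block size to pick the block_t used for copying.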
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Insert(
        const void* input_keys,
        const std::vector<const void*>& input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    thrust::device_vector<const void*> input_values_soa_device(
            input_values_soa.begin(), input_values_soa.end());

    int64_t n_values = input_values_soa.size();
    const void* const* ptr_input_values_soa =
            thrust::raw_pointer_cast(input_values_soa_device.data());

    DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
            buffer_accessor_.common_block_size_, [&]() {
                STDGPUInsertKernel<Key, Hash, Eq, block_t>
                        <<<blocks, threads, 0, core::cuda::GetStream()>>>(
                                impl_, buffer_accessor_,
                                static_cast<const Key*>(input_keys),
                                ptr_input_values_soa, output_buf_indices,
                                output_masks, count, n_values);
            });
    cuda::Synchronize(this->device_);
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Allocate(int64_t capacity) {
    CUDAScopedDevice scoped_device(this->device_);
    this->capacity_ = capacity;

    // Allocate buffer for key values.
    this->buffer_ = std::make_shared<HashBackendBuffer>(
            this->capacity_, this->key_dsize_, this->value_dsizes_,
            this->device_);
    buffer_accessor_.Setup(*this->buffer_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream scoped_stream(cuda::GetDefaultStream());

        impl_ = InternalStdGPUHashBackend<Key, Hash, Eq>::createDeviceObject(
                this->capacity_,
                InternalStdGPUHashBackendAllocator<Key>(this->device_.GetID()));
        cuda::Synchronize(this->device_);
    }
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Free() {
    CUDAScopedDevice scoped_device(this->device_);
    // Buffer is automatically handled by the smart pointer.
    buffer_accessor_.Shutdown(this->device_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream scoped_stream(cuda::GetDefaultStream());

        InternalStdGPUHashBackend<Key, Hash, Eq>::destroyDeviceObject(impl_);
    }
}

}  // namespace core
}  // namespace open3d
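Usage note: this backend is rarely used directly; it is normally selected through the public core::HashMap front end. The following is a minimal sketch of that path, assuming the Tensor-based HashMap constructor and Insert/Find overloads as recalled from the public API (see open3d/core/hashmap/HashMap.h for the exact signatures); the key/value dtypes, shapes, and capacity here are illustrative only.

#include "open3d/core/hashmap/HashMap.h"

void ExampleStdGPUHashMap() {
    using namespace open3d;
    core::Device device("CUDA:0");

    // 3D integer coordinates -> scalar float values, backed by the
    // StdGPUHashBackend defined in this header.
    core::HashMap hashmap(/*init_capacity=*/1000,
                          core::Int32, /*key_element_shape=*/{3},
                          core::Float32, /*value_element_shape=*/{1},
                          device, core::HashBackendType::StdGPU);

    core::Tensor keys = core::Tensor::Zeros({10, 3}, core::Int32, device);
    core::Tensor values = core::Tensor::Ones({10, 1}, core::Float32, device);

    core::Tensor buf_indices, masks;
    hashmap.Insert(keys, values, buf_indices, masks);  // masks flag new keys
    hashmap.Find(keys, buf_indices, masks);            // masks flag hits
}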