Actual source code: memory.cxx
1: #include <petsc/private/deviceimpl.h>
3: #include <petsc/private/cpp/register_finalize.hpp>
4: #include <petsc/private/cpp/type_traits.hpp>
6: #include <unordered_map>
7: #include <algorithm> // std::find_if
8: #include <cstring> // std::memset
10: const char *const PetscDeviceCopyModes[] = {"host_to_host", "device_to_host", "host_to_device", "device_to_device", "auto", "PetscDeviceCopyMode", "PETSC_DEVICE_COPY_", nullptr};
11: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOH) == 0, "");
12: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOH) == 1, "");
13: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOD) == 2, "");
14: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOD) == 3, "");
15: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_AUTO) == 4, "");
17: // ==========================================================================================
18: // MemoryMap
19: //
20: // Since the pointers allocated via PetscDeviceAllocate_Private() may be device pointers we
21: // cannot just store meta-data within the pointer itself (as we can't dereference them). So
22: // instead we need to keep an extra map to keep track of them
23: //
24: // Each entry maps pointer -> {
25: // PetscMemType - The memtype of the pointer
26: // PetscObjectId - A unique ID assigned at allocation or registration so auto-dep can
27: // identify the pointer
28: // size - The size (in bytes) of the allocation
29: // }
30: // ==========================================================================================
32: // GCC implementation for std::hash<T*>. LLVM's libc++ is almost 2x slower because they do all
33: // kinds of complicated murmur hashing, so we make sure to enforce GCC's version.
34: struct PointerHash {
35: template <typename T>
36: PETSC_NODISCARD std::size_t operator()(const T *ptr) const noexcept
37: {
38: return reinterpret_cast<std::size_t>(ptr);
39: }
40: };
42: class MemoryMap : public Petsc::RegisterFinalizeable<MemoryMap> {
43: public:
44: struct PointerAttributes {
45: PetscMemType mtype{}; // memtype of allocation
46: PetscObjectId id{}; // id of allocation
47: std::size_t size{}; // size of allocation (bytes)
49: // even though this is a POD and can be aggregate initialized, the STL uses () constructors
50: // in unordered_map and so we need to provide a trivial contructor...
51: constexpr PointerAttributes(PetscMemType, PetscObjectId, std::size_t) noexcept;
52: constexpr PointerAttributes() noexcept = default;
53: constexpr PointerAttributes(const PointerAttributes &) noexcept = default;
54: PETSC_CONSTEXPR_14 PointerAttributes &operator=(const PointerAttributes &) noexcept = default;
55: constexpr PointerAttributes(PointerAttributes &&) noexcept = default;
56: PETSC_CONSTEXPR_14 PointerAttributes &operator=(PointerAttributes &&) noexcept = default;
58: bool operator==(const PointerAttributes &) const noexcept;
60: PETSC_NODISCARD bool contains(const void *, const void *) const noexcept;
61: };
63: using map_type = std::unordered_map<void *, PointerAttributes, PointerHash>;
65: map_type map;
67: // return the iterator of the allocation containing ptr, or map.cend() if not found
68: PETSC_NODISCARD map_type::const_iterator search_for(const void *, bool = false) const noexcept;
70: private:
71: friend class Petsc::RegisterFinalizeable<MemoryMap>;
72: PETSC_NODISCARD PetscErrorCode register_finalize_() noexcept;
73: PETSC_NODISCARD PetscErrorCode finalize_() noexcept;
74: };
76: // ==========================================================================================
77: // PointerAttributes
78: // ==========================================================================================
80: constexpr MemoryMap::PointerAttributes::PointerAttributes(PetscMemType mtype_, PetscObjectId id_, std::size_t size_) noexcept : mtype(mtype_), id(id_), size(size_) { }
82: bool MemoryMap::PointerAttributes::operator==(const PointerAttributes &other) const noexcept
83: {
84: return mtype == other.mtype && id == other.id && size == other.size;
85: }
87: bool MemoryMap::PointerAttributes::contains(const void *ptr_begin, const void *ptr) const noexcept
88: {
89: return (ptr >= ptr_begin) && (ptr < (static_cast<const char *>(ptr_begin) + size));
90: }
92: // ==========================================================================================
93: // Memory map - Private API
94: // ==========================================================================================
96: PetscErrorCode MemoryMap::register_finalize_() noexcept
97: {
98: // Preallocate, this does give a modest performance bump since unordered_map is so __dog__
99: // slow if it needs to rehash. Experiments show that users tend not to have more than 5 or
100: // so concurrently live pointers lying around. 10 at most.
101: map.reserve(16);
102: return 0;
103: }
105: PetscErrorCode MemoryMap::finalize_() noexcept
106: {
107: PetscInfo(nullptr, "Finalizing memory map\n");
108: map = map_type{};
109: return 0;
110: }
112: // ==========================================================================================
113: // Memory map - Public API
114: // ==========================================================================================
116: /*
117: MemoryMap::search_for - retrieve an iterator to the key-value pair for a pointer in the map
119: Input Parameters:
120: + ptr - pointer to search for
121: - must_find - true if an error is raised if the pointer is not found (default: false)
123: Notes:
124: Accounts for sub-regions, i.e. if ptr is contained within another pointers region, it returns
125: the iterator to the super-pointers key-value pair.
127: If ptr is not found and must_find is false returns map.end(), otherwise raises an error
128: */
129: MemoryMap::map_type::const_iterator MemoryMap::search_for(const void *ptr, bool must_find) const noexcept
130: {
131: const auto end = map.end();
132: auto it = map.find(const_cast<map_type::key_type>(ptr));
134: // ptr was found, and points to an entire block
135: if (it != end) return it;
136: // wasn't found, but maybe its part of a block. have to search every block for it
137: // clang-format off
138: it = std::find_if(map.begin(), end, [ptr](const map_type::const_iterator::value_type &map_it) {
139: return map_it.second.contains(map_it.first, ptr);
140: });
142: return it;
143: // clang-format on
144: }
// The file-local registry instance consulted by the registration, allocation, deallocation, copy
// and memset routines below
146: static MemoryMap memory_map;
148: // ==========================================================================================
149: // Utility functions
150: // ==========================================================================================
152: static PetscErrorCode PetscDeviceCheckCapable_Private(PetscDeviceContext dctx, bool cond, const char descr[])
153: {
155: return 0;
156: }
158: // A helper utility, since register is called from PetscDeviceRegisterMemory() and
159: // PetscDevicAllocate(). The latter also needs the generated id, so instead of making it search
160: // the map again we just return it here
161: static PetscErrorCode PetscDeviceRegisterMemory_Private(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size, PetscObjectId *PETSC_RESTRICT id = nullptr)
162: {
163: auto &map = memory_map.map;
164: const auto it = memory_map.search_for(ptr);
166: if (it == map.cend()) {
167: // pointer was never registered with the map, insert it and bail
168: const auto newid = PetscObjectNewId_Internal();
170: if (PetscDefined(USE_DEBUG)) {
171: const auto tmp = MemoryMap::PointerAttributes(mtype, newid, size);
173: for (const auto &entry : map) {
174: // REVIEW ME: maybe this should just be handled...
176: entry.first, PetscMemTypeToString(entry.second.mtype), entry.second.size);
177: }
178: }
179: // clang-format off
180: if (id) *id = newid;
181: PetscCallCXX(map.emplace(
182: std::piecewise_construct,
183: std::forward_as_tuple(const_cast<MemoryMap::map_type::key_type>(ptr)),
184: std::forward_as_tuple(mtype, newid, size)
185: ));
186: // clang-format on
187: return 0;
188: }
189: if (PetscDefined(USE_DEBUG)) {
190: const auto &old = it->second;
193: PetscMemTypeToString(old.mtype), old.size, old.id, PetscMemTypeToString(mtype), size, old.id);
194: }
195: if (id) *id = it->second.id;
196: return 0;
197: }
199: /*@C
200: PetscDeviceRegisterMemory - Register a pointer for use with device-aware memory system
202: Not Collective
204: Input Parameters:
205: + ptr - The pointer to register
206: . mtype - The `PetscMemType` of the pointer
207: - size - The size (in bytes) of the memory region
209: Notes:
210: `ptr` need not point to the beginning of the memory range, however the user should register
211: the
213: It's OK to re-register the same `ptr` repeatedly (subsequent registrations do nothing)
214: however the given `mtype` and `size` must match the original registration.
216: `size` may be 0 (in which case this routine does nothing).
218: Level: intermediate
220: .seealso: `PetscDeviceMalloc()`, `PetscDeviceArrayCopy()`, `PetscDeviceFree()`,
221: `PetscDeviceArrayZero()`
222: @*/
223: PetscErrorCode PetscDeviceRegisterMemory(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size)
224: {
226: if (PetscUnlikely(!size)) return 0; // there is no point registering empty range
227: PetscDeviceRegisterMemory_Private(ptr, mtype, size);
228: return 0;
229: }
231: /*
232: PetscDeviceAllocate_Private - Allocate device-aware memory
234: Not Collective, Asynchronous, Auto-dependency aware
236: Input Parameters:
237: + dctx - The `PetscDeviceContext` used to allocate the memory
238: . clear - Whether or not the memory should be zeroed
239: . mtype - The type of memory to allocate
240: . n - The amount (in bytes) to allocate
241: - alignment - The alignment requirement (in bytes) of the allocated pointer
243: Output Parameter:
244: . ptr - The pointer to store the result in
246: Notes:
247: The user should prefer `PetscDeviceMalloc()` over this routine as it automatically computes
248: the size of the allocation and alignment based on the size of the datatype.
250: If the user is unsure about `alignment` -- or unable to compute it -- passing
251: `PETSC_MEMALIGN` will always work, though the user should beware that this may be quite
252: wasteful for very small allocations.
254: Memory allocated with this function must be freed with `PetscDeviceFree()` (or
255: `PetscDeviceDeallocate_Private()`).
257: If `n` is zero, then `ptr` is set to `PETSC_NULLPTR`.
259: This routine falls back to using `PetscMalloc1()` or `PetscCalloc1()` (depending on the value
260: of `clear`) if PETSc was not configured with device support. The user should note that
261: `mtype` and `alignment` are ignored in this case, as these routines allocate only host memory
262: aligned to `PETSC_MEMALIGN`.
264: Note result stored `ptr` is immediately valid and the user may freely inspect or manipulate
265: its value on function return, i.e.\:
267: .vb
268: PetscInt *ptr;
270: PetscDeviceAllocate_Private(dctx, PETSC_FALSE, PETSC_MEMTYPE_DEVICE, 20, alignof(PetscInt), (void**)&ptr);
272: PetscInt *sub_ptr = ptr + 10; // OK, no need to synchronize
274: ptr[0] = 10; // ERROR, directly accessing contents of ptr is undefined until synchronization
275: .ve
277: DAG representation:
278: .vb
279: time ->
281: -> dctx - |= CALL =| -\- dctx -->
282: \- ptr ->
283: .ve
285: Level: intermediate
287: .N ASYNC_API
289: .seealso: `PetscDeviceMalloc()`, `PetscDeviceFree()`, `PetscDeviceDeallocate_Private()`,
290: `PetscDeviceArrayCopy()`, `PetscDeviceArrayZero()`, `PetscMemType`
291: */
292: PetscErrorCode PetscDeviceAllocate_Private(PetscDeviceContext dctx, PetscBool clear, PetscMemType mtype, std::size_t n, std::size_t alignment, void **PETSC_RESTRICT ptr)
293: {
294: PetscObjectId id = 0;
296: if (PetscDefined(USE_DEBUG)) {
297: const auto is_power_of_2 = [](std::size_t num) { return (num & (num - 1)) == 0; };
301: }
303: *ptr = nullptr;
304: if (PetscUnlikely(!n)) return 0;
305: memory_map.register_finalize();
306: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
308: // get our pointer here
309: if (dctx->ops->memalloc) {
310: PetscUseTypeMethod(dctx, memalloc, clear, mtype, n, alignment, ptr);
311: } else {
312: PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "allocating");
313: PetscMallocA(1, clear, __LINE__, PETSC_FUNCTION_NAME, __FILE__, n, ptr);
314: }
315: PetscDeviceRegisterMemory_Private(*ptr, mtype, n, &id);
316: // Note this is a "write" so that the next dctx to try and read from the pointer has to wait
317: // for the allocation to be ready
318: PetscDeviceContextMarkIntentFromID(dctx, id, PETSC_MEMORY_ACCESS_WRITE, "memory allocation");
319: return 0;
320: }
322: /*
323: PetscDeviceDeallocate_Private - Free device-aware memory
325: Not Collective, Asynchronous, Auto-dependency aware
327: Input Parameters:
328: + dctx - The `PetscDeviceContext` used to free the memory
329: - ptr - The pointer to free
331: Notes:
332: `ptr` must have been allocated using any of `PetscDeviceMalloc()`, `PetscDeviceCalloc()` or
333: `PetscDeviceAllocate_Private()`, or registered with the system via `PetscDeviceRegisterMemory()`.
335: The user should prefer `PetscDeviceFree()` over this routine as it automatically sets `ptr`
336: to `PETSC_NULLPTR` on successful deallocation.
338: `ptr` may be `NULL`.
340: This routine falls back to using `PetscFree()` if PETSc was not configured with device
341: support. The user should note that `PetscFree()` frees only host memory.
343: DAG representation:
344: .vb
345: time ->
347: -> dctx -/- |= CALL =| - dctx ->
348: -> ptr -/
349: .ve
351: Level: intermediate
353: .N ASYNC_API
355: .seealso: `PetscDeviceFree()`, `PetscDeviceAllocate_Private()`
356: */
357: PetscErrorCode PetscDeviceDeallocate_Private(PetscDeviceContext dctx, void *PETSC_RESTRICT ptr)
358: {
359: if (ptr) {
360: auto &map = memory_map.map;
361: const auto found_it = map.find(const_cast<MemoryMap::map_type::key_type>(ptr));
363: if (PetscUnlikelyDebug(found_it == map.end())) {
364: // OK this is a bad pointer, now determine why
365: const auto it = memory_map.search_for(ptr);
367: // if it is map.cend() then no allocation owns it, meaning it was not allocated by us!
369: // if we are here then we did allocate it but the user has tried to do something along
370: // the lines of:
371: //
372: // allocate(&ptr, size);
373: // deallocate(ptr+5);
374: //
375: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Attempting to deallocate pointer %p which is a suballocation of %p (memtype %s, id %" PetscInt64_FMT ", size %zu bytes)", ptr, it->first, PetscMemTypeToString(it->second.mtype), it->second.id,
376: it->second.size);
377: }
379: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
380: // mark intent BEFORE we free, note we mark as write so that we are made to wait on any
381: // outstanding reads (don't want to kill the pointer before they are done)
382: PetscDeviceContextMarkIntentFromID(dctx, found_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory deallocation");
383: // do free
384: if (dctx->ops->memfree) {
385: PetscUseTypeMethod(dctx, memfree, found_it->second.mtype, (void **)&ptr);
386: } else {
387: PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(found_it->second.mtype), "freeing");
388: }
389: // if ptr still exists, then the device context could not handle it
390: if (ptr) PetscFree(ptr);
391: map.erase(found_it);
392: }
393: return 0;
394: }
396: /*@C
397: PetscDeviceMemcpy - Copy memory in a device-aware manner
399: Not Collective, Asynchronous, Auto-dependency aware
401: Input Parameters:
402: + dctx - The `PetscDeviceContext` used to copy the memory
403: . dest - The pointer to copy to
404: . src - The pointer to copy from
405: - n - The amount (in bytes) to copy
407: Notes:
408: Both `dest` and `src` must have been allocated by `PetscDeviceMalloc()` or
409: `PetscDeviceCalloc()`.
411: `src` and `dest` cannot overlap.
413: If both `src` and `dest` are on the host this routine is fully synchronous.
415: The user should prefer `PetscDeviceArrayCopy()` over this routine as it automatically
416: computes the number of bytes to copy from the size of the pointer types.
418: DAG representation:
419: .vb
420: time ->
422: -> dctx - |= CALL =| - dctx ->
423: -> dest --------------------->
424: -> src ---------------------->
425: .ve
427: Level: intermediate
429: .N ASYNC_API
431: .seealso: `PetscDeviceArrayCopy()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
432: `PetscDeviceFree()`
433: @*/
434: PetscErrorCode PetscDeviceMemcpy(PetscDeviceContext dctx, void *PETSC_RESTRICT dest, const void *PETSC_RESTRICT src, std::size_t n)
435: {
436: if (!n) return 0;
439: if (dest == src) return 0;
440: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
441: {
442: const auto dest_it = memory_map.search_for(dest, true);
443: const auto src_it = memory_map.search_for(src, true);
444: const auto mode = PetscMemTypeToDeviceCopyMode(dest_it->second.mtype, src_it->second.mtype);
446: PetscDeviceContextMarkIntentFromID(dctx, src_it->second.id, PETSC_MEMORY_ACCESS_READ, "memory copy (src)");
447: PetscDeviceContextMarkIntentFromID(dctx, dest_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory copy (dest)");
448: // perform the copy
449: if (dctx->ops->memcopy) {
450: PetscUseTypeMethod(dctx, memcopy, dest, src, n, mode);
451: if (mode == PETSC_DEVICE_COPY_HTOD) {
452: PetscLogCpuToGpu(n);
453: } else if (mode == PETSC_DEVICE_COPY_DTOH) {
454: PetscLogGpuToCpu(n);
455: }
456: } else {
457: // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
458: // (pinned) but being copied by a host dctx
459: PetscDeviceCheckCapable_Private(dctx, mode == PETSC_DEVICE_COPY_HTOH, "copying");
460: PetscMemcpy(dest, src, n);
461: }
462: }
463: return 0;
464: }
466: /*@C
467: PetscDeviceMemset - Memset device-aware memory
469: Not Collective, Asynchronous, Auto-dependency aware
471: Input Parameters:
472: + dctx - The `PetscDeviceContext` used to memset the memory
473: . ptr - The pointer to the memory
474: . v - The value to set
475: - n - The amount (in bytes) to set
477: Notes:
478: `ptr` must have been allocated by `PetscDeviceMalloc()` or `PetscDeviceCalloc()`.
480: The user should prefer `PetscDeviceArrayZero()` over this routine as it automatically
481: computes the number of bytes to copy from the size of the pointer types, though they should
482: note that it only zeros memory.
484: This routine is analogous to `memset()`. That is, this routine copies the value
485: `static_cast<unsigned char>(v)` into each of the first count characters of the object pointed
486: to by `dest`.
488: If `dest` is on device, this routine is asynchronous.
490: DAG representation:
491: .vb
492: time ->
494: -> dctx - |= CALL =| - dctx ->
495: -> dest --------------------->
496: .ve
498: Level: intermediate
500: .N ASYNC_API
502: .seealso: `PetscDeviceArrayZero()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
503: `PetscDeviceFree()`
504: @*/
505: PetscErrorCode PetscDeviceMemset(PetscDeviceContext dctx, void *ptr, PetscInt v, std::size_t n)
506: {
507: if (PetscUnlikely(!n)) return 0;
509: PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
510: {
511: const auto ptr_it = memory_map.search_for(ptr, true);
512: const auto mtype = ptr_it->second.mtype;
514: PetscDeviceContextMarkIntentFromID(dctx, ptr_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory set");
515: if (dctx->ops->memset) {
516: PetscUseTypeMethod(dctx, memset, mtype, ptr, v, n);
517: } else {
518: // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
519: // (pinned) but being memset by a host dctx
520: PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "memsetting");
521: std::memset(ptr, static_cast<int>(v), n);
522: }
523: }
524: return 0;
525: }