Actual source code: memory.cxx

  1: #include <petsc/private/deviceimpl.h>

  3: #include <petsc/private/cpp/register_finalize.hpp>
  4: #include <petsc/private/cpp/type_traits.hpp>

  6: #include <unordered_map>
  7: #include <algorithm> // std::find_if
  8: #include <cstring>   // std::memset

 10: const char *const PetscDeviceCopyModes[] = {"host_to_host", "device_to_host", "host_to_device", "device_to_device", "auto", "PetscDeviceCopyMode", "PETSC_DEVICE_COPY_", nullptr};
 11: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOH) == 0, "");
 12: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOH) == 1, "");
 13: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_HTOD) == 2, "");
 14: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_DTOD) == 3, "");
 15: static_assert(Petsc::util::integral_value(PETSC_DEVICE_COPY_AUTO) == 4, "");

 17: // ==========================================================================================
 18: // MemoryMap
 19: //
 20: // Since the pointers allocated via PetscDeviceAllocate_Private() may be device pointers we
 21: // cannot just store meta-data within the pointer itself (as we can't dereference them). So
 22: // instead we need to keep an extra map to keep track of them
 23: //
 24: // Each entry maps pointer -> {
 25: //   PetscMemType  - The memtype of the pointer
 26: //   PetscObjectId - A unique ID assigned at allocation or registration so auto-dep can
 27: //                   identify the pointer
 28: //   size          - The size (in bytes) of the allocation
 29: // }
 30: // ==========================================================================================

 32: // GCC implementation for std::hash<T*>. LLVM's libc++ is almost 2x slower because they do all
 33: // kinds of complicated murmur hashing, so we make sure to enforce GCC's version.
 34: struct PointerHash {
 35:   template <typename T>
 36:   PETSC_NODISCARD std::size_t operator()(const T *ptr) const noexcept
 37:   {
 38:     return reinterpret_cast<std::size_t>(ptr);
 39:   }
 40: };

 42: class MemoryMap : public Petsc::RegisterFinalizeable<MemoryMap> {
 43: public:
 44:   struct PointerAttributes {
 45:     PetscMemType  mtype{}; // memtype of allocation
 46:     PetscObjectId id{};    // id of allocation
 47:     std::size_t   size{};  // size of allocation (bytes)

 49:     // even though this is a POD and can be aggregate initialized, the STL uses () constructors
 50:     // in unordered_map and so we need to provide a trivial contructor...
 51:     constexpr PointerAttributes(PetscMemType, PetscObjectId, std::size_t) noexcept;
 52:     constexpr PointerAttributes() noexcept                                              = default;
 53:     constexpr PointerAttributes(const PointerAttributes &) noexcept                     = default;
 54:     PETSC_CONSTEXPR_14 PointerAttributes &operator=(const PointerAttributes &) noexcept = default;
 55:     constexpr PointerAttributes(PointerAttributes &&) noexcept                          = default;
 56:     PETSC_CONSTEXPR_14 PointerAttributes &operator=(PointerAttributes &&) noexcept      = default;

 58:     bool operator==(const PointerAttributes &) const noexcept;

 60:     PETSC_NODISCARD bool contains(const void *, const void *) const noexcept;
 61:   };

 63:   using map_type = std::unordered_map<void *, PointerAttributes, PointerHash>;

 65:   map_type map;

 67:   // return the iterator of the allocation containing ptr, or map.cend() if not found
 68:   PETSC_NODISCARD map_type::const_iterator search_for(const void *, bool = false) const noexcept;

 70: private:
 71:   friend class Petsc::RegisterFinalizeable<MemoryMap>;
 72:   PETSC_NODISCARD PetscErrorCode register_finalize_() noexcept;
 73:   PETSC_NODISCARD PetscErrorCode finalize_() noexcept;
 74: };

 76: // ==========================================================================================
 77: // PointerAttributes
 78: // ==========================================================================================

 80: constexpr MemoryMap::PointerAttributes::PointerAttributes(PetscMemType mtype_, PetscObjectId id_, std::size_t size_) noexcept : mtype(mtype_), id(id_), size(size_) { }

 82: bool MemoryMap::PointerAttributes::operator==(const PointerAttributes &other) const noexcept
 83: {
 84:   return mtype == other.mtype && id == other.id && size == other.size;
 85: }

 87: bool MemoryMap::PointerAttributes::contains(const void *ptr_begin, const void *ptr) const noexcept
 88: {
 89:   return (ptr >= ptr_begin) && (ptr < (static_cast<const char *>(ptr_begin) + size));
 90: }

 92: // ==========================================================================================
 93: // Memory map - Private API
 94: // ==========================================================================================

 96: PetscErrorCode MemoryMap::register_finalize_() noexcept
 97: {
 98:   // Preallocate, this does give a modest performance bump since unordered_map is so __dog__
 99:   // slow if it needs to rehash. Experiments show that users tend not to have more than 5 or
100:   // so concurrently live pointers lying around. 10 at most.
101:   map.reserve(16);
102:   return 0;
103: }

105: PetscErrorCode MemoryMap::finalize_() noexcept
106: {
107:   PetscInfo(nullptr, "Finalizing memory map\n");
108:   map = map_type{};
109:   return 0;
110: }

112: // ==========================================================================================
113: // Memory map - Public API
114: // ==========================================================================================

116: /*
117:   MemoryMap::search_for - retrieve an iterator to the key-value pair for a pointer in the map

119:   Input Parameters:
120: + ptr       - pointer to search for
121: - must_find - true if an error is raised if the pointer is not found (default: false)

123:   Notes:
124:   Accounts for sub-regions, i.e. if ptr is contained within another pointers region, it returns
125:   the iterator to the super-pointers key-value pair.

127:   If ptr is not found and must_find is false returns map.end(), otherwise raises an error
128: */
129: MemoryMap::map_type::const_iterator MemoryMap::search_for(const void *ptr, bool must_find) const noexcept
130: {
131:   const auto end = map.end();
132:   auto       it  = map.find(const_cast<map_type::key_type>(ptr));

134:   // ptr was found, and points to an entire block
135:   if (it != end) return it;
136:   // wasn't found, but maybe its part of a block. have to search every block for it
137:   // clang-format off
138:   it = std::find_if(map.begin(), end, [ptr](const map_type::const_iterator::value_type &map_it) {
139:     return map_it.second.contains(map_it.first, ptr);
140:   });
142:   return it;
143:   // clang-format on
144: }

// File-local MemoryMap instance; the registration, allocation, deallocation, copy and memset
// routines in this file all look pointers up through it.
146: static MemoryMap memory_map;

148: // ==========================================================================================
149: // Utility functions
150: // ==========================================================================================

152: static PetscErrorCode PetscDeviceCheckCapable_Private(PetscDeviceContext dctx, bool cond, const char descr[])
153: {
155:   return 0;
156: }

158: // A helper utility, since register is called from PetscDeviceRegisterMemory() and
159: // PetscDevicAllocate(). The latter also needs the generated id, so instead of making it search
160: // the map again we just return it here
161: static PetscErrorCode PetscDeviceRegisterMemory_Private(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size, PetscObjectId *PETSC_RESTRICT id = nullptr)
162: {
163:   auto      &map = memory_map.map;
164:   const auto it  = memory_map.search_for(ptr);

166:   if (it == map.cend()) {
167:     // pointer was never registered with the map, insert it and bail
168:     const auto newid = PetscObjectNewId_Internal();

170:     if (PetscDefined(USE_DEBUG)) {
171:       const auto tmp = MemoryMap::PointerAttributes(mtype, newid, size);

173:       for (const auto &entry : map) {
174:         // REVIEW ME: maybe this should just be handled...
176:                    entry.first, PetscMemTypeToString(entry.second.mtype), entry.second.size);
177:       }
178:     }
179:     // clang-format off
180:     if (id) *id = newid;
181:     PetscCallCXX(map.emplace(
182:       std::piecewise_construct,
183:       std::forward_as_tuple(const_cast<MemoryMap::map_type::key_type>(ptr)),
184:       std::forward_as_tuple(mtype, newid, size)
185:     ));
186:     // clang-format on
187:     return 0;
188:   }
189:   if (PetscDefined(USE_DEBUG)) {
190:     const auto &old = it->second;

193:                PetscMemTypeToString(old.mtype), old.size, old.id, PetscMemTypeToString(mtype), size, old.id);
194:   }
195:   if (id) *id = it->second.id;
196:   return 0;
197: }

199: /*@C
200:   PetscDeviceRegisterMemory - Register a pointer for use with device-aware memory system

202:   Not Collective

204:   Input Parameters:
205: + ptr   - The pointer to register
206: . mtype - The `PetscMemType` of the pointer
207: - size  - The size (in bytes) of the memory region

209:   Notes:
210:   `ptr` need not point to the beginning of the memory range, however the user should register
211:   the

213:   It's OK to re-register the same `ptr` repeatedly (subsequent registrations do nothing)
214:   however the given `mtype` and `size` must match the original registration.

216:   `size` may be 0 (in which case this routine does nothing).

218:   Level: intermediate

220: .seealso: `PetscDeviceMalloc()`, `PetscDeviceArrayCopy()`, `PetscDeviceFree()`,
221: `PetscDeviceArrayZero()`
222: @*/
223: PetscErrorCode PetscDeviceRegisterMemory(const void *PETSC_RESTRICT ptr, PetscMemType mtype, std::size_t size)
224: {
226:   if (PetscUnlikely(!size)) return 0; // there is no point registering empty range
227:   PetscDeviceRegisterMemory_Private(ptr, mtype, size);
228:   return 0;
229: }

231: /*
232:   PetscDeviceAllocate_Private - Allocate device-aware memory

234:   Not Collective, Asynchronous, Auto-dependency aware

236:   Input Parameters:
237: + dctx      - The `PetscDeviceContext` used to allocate the memory
238: . clear     - Whether or not the memory should be zeroed
239: . mtype     - The type of memory to allocate
240: . n         - The amount (in bytes) to allocate
241: - alignment - The alignment requirement (in bytes) of the allocated pointer

243:   Output Parameter:
244: . ptr - The pointer to store the result in

246:   Notes:
247:   The user should prefer `PetscDeviceMalloc()` over this routine as it automatically computes
248:   the size of the allocation and alignment based on the size of the datatype.

250:   If the user is unsure about `alignment` -- or unable to compute it -- passing
251:   `PETSC_MEMALIGN` will always work, though the user should beware that this may be quite
252:   wasteful for very small allocations.

254:   Memory allocated with this function must be freed with `PetscDeviceFree()` (or
255:   `PetscDeviceDeallocate_Private()`).

257:   If `n` is zero, then `ptr` is set to `PETSC_NULLPTR`.

259:   This routine falls back to using `PetscMalloc1()` or `PetscCalloc1()` (depending on the value
260:   of `clear`) if PETSc was not configured with device support. The user should note that
261:   `mtype` and `alignment` are ignored in this case, as these routines allocate only host memory
262:   aligned to `PETSC_MEMALIGN`.

264:   Note result stored `ptr` is immediately valid and the user may freely inspect or manipulate
265:   its value on function return, i.e.\:

267: .vb
268:   PetscInt *ptr;

270:   PetscDeviceAllocate_Private(dctx, PETSC_FALSE, PETSC_MEMTYPE_DEVICE, 20, alignof(PetscInt), (void**)&ptr);

272:   PetscInt *sub_ptr = ptr + 10; // OK, no need to synchronize

274:   ptr[0] = 10; // ERROR, directly accessing contents of ptr is undefined until synchronization
275: .ve

277:   DAG representation:
278: .vb
279:   time ->

281:   -> dctx - |= CALL =| -\- dctx -->
282:                          \- ptr ->
283: .ve

285:   Level: intermediate

287: .N ASYNC_API

289: .seealso: `PetscDeviceMalloc()`, `PetscDeviceFree()`, `PetscDeviceDeallocate_Private()`,
290: `PetscDeviceArrayCopy()`, `PetscDeviceArrayZero()`, `PetscMemType`
291: */
292: PetscErrorCode PetscDeviceAllocate_Private(PetscDeviceContext dctx, PetscBool clear, PetscMemType mtype, std::size_t n, std::size_t alignment, void **PETSC_RESTRICT ptr)
293: {
294:   PetscObjectId id = 0;

296:   if (PetscDefined(USE_DEBUG)) {
297:     const auto is_power_of_2 = [](std::size_t num) { return (num & (num - 1)) == 0; };

301:   }
303:   *ptr = nullptr;
304:   if (PetscUnlikely(!n)) return 0;
305:   memory_map.register_finalize();
306:   PetscDeviceContextGetOptionalNullContext_Internal(&dctx);

308:   // get our pointer here
309:   if (dctx->ops->memalloc) {
310:     PetscUseTypeMethod(dctx, memalloc, clear, mtype, n, alignment, ptr);
311:   } else {
312:     PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "allocating");
313:     PetscMallocA(1, clear, __LINE__, PETSC_FUNCTION_NAME, __FILE__, n, ptr);
314:   }
315:   PetscDeviceRegisterMemory_Private(*ptr, mtype, n, &id);
316:   // Note this is a "write" so that the next dctx to try and read from the pointer has to wait
317:   // for the allocation to be ready
318:   PetscDeviceContextMarkIntentFromID(dctx, id, PETSC_MEMORY_ACCESS_WRITE, "memory allocation");
319:   return 0;
320: }

322: /*
323:   PetscDeviceDeallocate_Private - Free device-aware memory

325:   Not Collective, Asynchronous, Auto-dependency aware

327:   Input Parameters:
328: + dctx  - The `PetscDeviceContext` used to free the memory
329: - ptr   - The pointer to free

331:   Notes:
332:   `ptr` must have been allocated using any of `PetscDeviceMalloc()`, `PetscDeviceCalloc()` or
333:   `PetscDeviceAllocate_Private()`, or registered with the system via `PetscDeviceRegisterMemory()`.

335:   The user should prefer `PetscDeviceFree()` over this routine as it automatically sets `ptr`
336:   to `PETSC_NULLPTR` on successful deallocation.

338:   `ptr` may be `NULL`.

340:   This routine falls back to using `PetscFree()` if PETSc was not configured with device
341:   support. The user should note that `PetscFree()` frees only host memory.

343:   DAG representation:
344: .vb
345:   time ->

347:   -> dctx -/- |= CALL =| - dctx ->
348:   -> ptr -/
349: .ve

351:   Level: intermediate

353: .N ASYNC_API

355: .seealso: `PetscDeviceFree()`, `PetscDeviceAllocate_Private()`
356: */
357: PetscErrorCode PetscDeviceDeallocate_Private(PetscDeviceContext dctx, void *PETSC_RESTRICT ptr)
358: {
359:   if (ptr) {
360:     auto      &map      = memory_map.map;
361:     const auto found_it = map.find(const_cast<MemoryMap::map_type::key_type>(ptr));

363:     if (PetscUnlikelyDebug(found_it == map.end())) {
364:       // OK this is a bad pointer, now determine why
365:       const auto it = memory_map.search_for(ptr);

367:       // if it is map.cend() then no allocation owns it, meaning it was not allocated by us!
369:       // if we are here then we did allocate it but the user has tried to do something along
370:       // the lines of:
371:       //
372:       // allocate(&ptr, size);
373:       // deallocate(ptr+5);
374:       //
375:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Attempting to deallocate pointer %p which is a suballocation of %p (memtype %s, id %" PetscInt64_FMT ", size %zu bytes)", ptr, it->first, PetscMemTypeToString(it->second.mtype), it->second.id,
376:               it->second.size);
377:     }

379:     PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
380:     // mark intent BEFORE we free, note we mark as write so that we are made to wait on any
381:     // outstanding reads (don't want to kill the pointer before they are done)
382:     PetscDeviceContextMarkIntentFromID(dctx, found_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory deallocation");
383:     // do free
384:     if (dctx->ops->memfree) {
385:       PetscUseTypeMethod(dctx, memfree, found_it->second.mtype, (void **)&ptr);
386:     } else {
387:       PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(found_it->second.mtype), "freeing");
388:     }
389:     // if ptr still exists, then the device context could not handle it
390:     if (ptr) PetscFree(ptr);
391:     map.erase(found_it);
392:   }
393:   return 0;
394: }

396: /*@C
397:   PetscDeviceMemcpy - Copy memory in a device-aware manner

399:   Not Collective, Asynchronous, Auto-dependency aware

401:   Input Parameters:
402: + dctx - The `PetscDeviceContext` used to copy the memory
403: . dest - The pointer to copy to
404: . src  - The pointer to copy from
405: - n    - The amount (in bytes) to copy

407:   Notes:
408:   Both `dest` and `src` must have been allocated by `PetscDeviceMalloc()` or
409:   `PetscDeviceCalloc()`.

411:   `src` and `dest` cannot overlap.

413:   If both `src` and `dest` are on the host this routine is fully synchronous.

415:   The user should prefer `PetscDeviceArrayCopy()` over this routine as it automatically
416:   computes the number of bytes to copy from the size of the pointer types.

418:   DAG representation:
419: .vb
420:   time ->

422:   -> dctx - |= CALL =| - dctx ->
423:   -> dest --------------------->
424:   -> src ---------------------->
425: .ve

427:   Level: intermediate

429: .N ASYNC_API

431: .seealso: `PetscDeviceArrayCopy()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
432: `PetscDeviceFree()`
433: @*/
434: PetscErrorCode PetscDeviceMemcpy(PetscDeviceContext dctx, void *PETSC_RESTRICT dest, const void *PETSC_RESTRICT src, std::size_t n)
435: {
436:   if (!n) return 0;
439:   if (dest == src) return 0;
440:   PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
441:   {
442:     const auto dest_it = memory_map.search_for(dest, true);
443:     const auto src_it  = memory_map.search_for(src, true);
444:     const auto mode    = PetscMemTypeToDeviceCopyMode(dest_it->second.mtype, src_it->second.mtype);

446:     PetscDeviceContextMarkIntentFromID(dctx, src_it->second.id, PETSC_MEMORY_ACCESS_READ, "memory copy (src)");
447:     PetscDeviceContextMarkIntentFromID(dctx, dest_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory copy (dest)");
448:     // perform the copy
449:     if (dctx->ops->memcopy) {
450:       PetscUseTypeMethod(dctx, memcopy, dest, src, n, mode);
451:       if (mode == PETSC_DEVICE_COPY_HTOD) {
452:         PetscLogCpuToGpu(n);
453:       } else if (mode == PETSC_DEVICE_COPY_DTOH) {
454:         PetscLogGpuToCpu(n);
455:       }
456:     } else {
457:       // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
458:       // (pinned) but being copied by a host dctx
459:       PetscDeviceCheckCapable_Private(dctx, mode == PETSC_DEVICE_COPY_HTOH, "copying");
460:       PetscMemcpy(dest, src, n);
461:     }
462:   }
463:   return 0;
464: }

466: /*@C
467:   PetscDeviceMemset - Memset device-aware memory

469:   Not Collective, Asynchronous, Auto-dependency aware

471:   Input Parameters:
472: + dctx  - The `PetscDeviceContext` used to memset the memory
473: . ptr   - The pointer to the memory
474: . v     - The value to set
475: - n     - The amount (in bytes) to set

477:   Notes:
478:   `ptr` must have been allocated by `PetscDeviceMalloc()` or `PetscDeviceCalloc()`.

480:   The user should prefer `PetscDeviceArrayZero()` over this routine as it automatically
481:   computes the number of bytes to copy from the size of the pointer types, though they should
482:   note that it only zeros memory.

484:   This routine is analogous to `memset()`. That is, this routine copies the value
485:   `static_cast<unsigned char>(v)` into each of the first count characters of the object pointed
486:   to by `dest`.

488:   If `dest` is on device, this routine is asynchronous.

490:   DAG representation:
491: .vb
492:   time ->

494:   -> dctx - |= CALL =| - dctx ->
495:   -> dest --------------------->
496: .ve

498:   Level: intermediate

500: .N ASYNC_API

502: .seealso: `PetscDeviceArrayZero()`, `PetscDeviceMalloc()`, `PetscDeviceCalloc()`,
503: `PetscDeviceFree()`
504: @*/
505: PetscErrorCode PetscDeviceMemset(PetscDeviceContext dctx, void *ptr, PetscInt v, std::size_t n)
506: {
507:   if (PetscUnlikely(!n)) return 0;
509:   PetscDeviceContextGetOptionalNullContext_Internal(&dctx);
510:   {
511:     const auto ptr_it = memory_map.search_for(ptr, true);
512:     const auto mtype  = ptr_it->second.mtype;

514:     PetscDeviceContextMarkIntentFromID(dctx, ptr_it->second.id, PETSC_MEMORY_ACCESS_WRITE, "memory set");
515:     if (dctx->ops->memset) {
516:       PetscUseTypeMethod(dctx, memset, mtype, ptr, v, n);
517:     } else {
518:       // REVIEW ME: we might potentially need to sync here if the memory is device-allocated
519:       // (pinned) but being memset by a host dctx
520:       PetscDeviceCheckCapable_Private(dctx, PetscMemTypeHost(mtype), "memsetting");
521:       std::memset(ptr, static_cast<int>(v), n);
522:     }
523:   }
524:   return 0;
525: }