1407 lines
54 KiB
C
1407 lines
54 KiB
C
|
//===- PluginInterface.h - Target independent plugin device interface -----===//
|
||
|
//
|
||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
|
||
|
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
|
||
|
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
|
||
|
|
||
|
#include <cstddef>
|
||
|
#include <cstdint>
|
||
|
#include <deque>
|
||
|
#include <list>
|
||
|
#include <map>
|
||
|
#include <shared_mutex>
|
||
|
#include <vector>
|
||
|
|
||
|
#include "Shared/Debug.h"
|
||
|
#include "Shared/Environment.h"
|
||
|
#include "Shared/EnvironmentVar.h"
|
||
|
#include "Shared/Requirements.h"
|
||
|
#include "Shared/Utils.h"
|
||
|
|
||
|
#include "GlobalHandler.h"
|
||
|
#include "JIT.h"
|
||
|
#include "MemoryManager.h"
|
||
|
#include "RPC.h"
|
||
|
#include "omptarget.h"
|
||
|
|
||
|
#ifdef OMPT_SUPPORT
|
||
|
#include "omp-tools.h"
|
||
|
#endif
|
||
|
|
||
|
#include "llvm/ADT/SmallVector.h"
|
||
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
||
|
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
||
|
#include "llvm/Support/Allocator.h"
|
||
|
#include "llvm/Support/Error.h"
|
||
|
#include "llvm/Support/ErrorHandling.h"
|
||
|
#include "llvm/Support/MemoryBufferRef.h"
|
||
|
#include "llvm/Support/raw_ostream.h"
|
||
|
#include "llvm/TargetParser/Triple.h"
|
||
|
|
||
|
namespace llvm {
|
||
|
namespace omp {
|
||
|
namespace target {
|
||
|
|
||
|
namespace plugin {
|
||
|
|
||
|
struct GenericPluginTy;
|
||
|
struct GenericKernelTy;
|
||
|
struct GenericDeviceTy;
|
||
|
|
||
|
/// Class that wraps the __tgt_async_info to simply its usage. In case the
|
||
|
/// object is constructed without a valid __tgt_async_info, the object will use
|
||
|
/// an internal one and will synchronize the current thread with the pending
|
||
|
/// operations when calling AsyncInfoWrapperTy::finalize(). This latter function
|
||
|
/// must be called before destroying the wrapper object.
|
||
|
struct AsyncInfoWrapperTy {
|
||
|
AsyncInfoWrapperTy(GenericDeviceTy &Device, __tgt_async_info *AsyncInfoPtr);
|
||
|
|
||
|
~AsyncInfoWrapperTy() {
|
||
|
assert(!AsyncInfoPtr && "AsyncInfoWrapperTy not finalized");
|
||
|
}
|
||
|
|
||
|
/// Get the raw __tgt_async_info pointer.
|
||
|
operator __tgt_async_info *() const { return AsyncInfoPtr; }
|
||
|
|
||
|
/// Indicate whether there is queue.
|
||
|
bool hasQueue() const { return (AsyncInfoPtr->Queue != nullptr); }
|
||
|
|
||
|
/// Get the queue.
|
||
|
template <typename Ty> Ty getQueueAs() {
|
||
|
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
|
||
|
"Queue is not of the same size as target type");
|
||
|
return static_cast<Ty>(AsyncInfoPtr->Queue);
|
||
|
}
|
||
|
|
||
|
/// Set the queue.
|
||
|
template <typename Ty> void setQueueAs(Ty Queue) {
|
||
|
static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
|
||
|
"Queue is not of the same size as target type");
|
||
|
assert(!AsyncInfoPtr->Queue && "Overwriting queue");
|
||
|
AsyncInfoPtr->Queue = Queue;
|
||
|
}
|
||
|
|
||
|
/// Synchronize with the __tgt_async_info's pending operations if it's the
|
||
|
/// internal async info. The error associated to the aysnchronous operations
|
||
|
/// issued in this queue must be provided in \p Err. This function will update
|
||
|
/// the error parameter with the result of the synchronization if it was
|
||
|
/// actually executed. This function must be called before destroying the
|
||
|
/// object and only once.
|
||
|
void finalize(Error &Err);
|
||
|
|
||
|
/// Register \p Ptr as an associated alloction that is freed after
|
||
|
/// finalization.
|
||
|
void freeAllocationAfterSynchronization(void *Ptr) {
|
||
|
AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
GenericDeviceTy &Device;
|
||
|
__tgt_async_info LocalAsyncInfo;
|
||
|
__tgt_async_info *AsyncInfoPtr;
|
||
|
};
|
||
|
|
||
|
/// The information level represents the level of a key-value property in the
|
||
|
/// info tree print (i.e. indentation). The first level should be the default.
|
||
|
enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
|
||
|
|
||
|
/// Class for storing device information and later be printed. An object of this
|
||
|
/// type acts as a queue of key-value properties. Each property has a key, a
|
||
|
/// a value, and an optional unit for the value. For printing purposes, the
|
||
|
/// information can be classified into several levels. These levels are useful
|
||
|
/// for defining sections and subsections. Thus, each key-value property also
|
||
|
/// has an additional field indicating to which level belongs to. Notice that
|
||
|
/// we use the level to determine the indentation of the key-value property at
|
||
|
/// printing time. See the enum InfoLevelKind for the list of accepted levels.
|
||
|
class InfoQueueTy {
|
||
|
struct InfoQueueEntryTy {
|
||
|
std::string Key;
|
||
|
std::string Value;
|
||
|
std::string Units;
|
||
|
uint64_t Level;
|
||
|
};
|
||
|
|
||
|
std::deque<InfoQueueEntryTy> Queue;
|
||
|
|
||
|
public:
|
||
|
/// Add a new info entry to the queue. The entry requires at least a key
|
||
|
/// string in \p Key. The value in \p Value is optional and can be any type
|
||
|
/// that is representable as a string. The units in \p Units is optional and
|
||
|
/// must be a string. The info level is a template parameter that defaults to
|
||
|
/// the first level (top level).
|
||
|
template <InfoLevelKind L = InfoLevel1, typename T = std::string>
|
||
|
void add(const std::string &Key, T Value = T(),
|
||
|
const std::string &Units = std::string()) {
|
||
|
assert(!Key.empty() && "Invalid info key");
|
||
|
|
||
|
// Convert the value to a string depending on its type.
|
||
|
if constexpr (std::is_same_v<T, bool>)
|
||
|
Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
|
||
|
else if constexpr (std::is_arithmetic_v<T>)
|
||
|
Queue.push_back({Key, std::to_string(Value), Units, L});
|
||
|
else
|
||
|
Queue.push_back({Key, Value, Units, L});
|
||
|
}
|
||
|
|
||
|
/// Print all info entries added to the queue.
|
||
|
void print() const {
|
||
|
// We print four spances for each level.
|
||
|
constexpr uint64_t IndentSize = 4;
|
||
|
|
||
|
// Find the maximum key length (level + key) to compute the individual
|
||
|
// indentation of each entry.
|
||
|
uint64_t MaxKeySize = 0;
|
||
|
for (const auto &Entry : Queue) {
|
||
|
uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
|
||
|
if (KeySize > MaxKeySize)
|
||
|
MaxKeySize = KeySize;
|
||
|
}
|
||
|
|
||
|
// Print all info entries.
|
||
|
for (const auto &Entry : Queue) {
|
||
|
// Compute the indentations for the current entry.
|
||
|
uint64_t KeyIndentSize = Entry.Level * IndentSize;
|
||
|
uint64_t ValIndentSize =
|
||
|
MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
|
||
|
|
||
|
llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
|
||
|
<< std::string(ValIndentSize, ' ') << Entry.Value
|
||
|
<< (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
|
||
|
}
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/// Class wrapping a __tgt_device_image and its offload entry table on a
|
||
|
/// specific device. This class is responsible for storing and managing
|
||
|
/// the offload entries for an image on a device.
|
||
|
class DeviceImageTy {
|
||
|
/// Image identifier within the corresponding device. Notice that this id is
|
||
|
/// not unique between different device; they may overlap.
|
||
|
int32_t ImageId;
|
||
|
|
||
|
/// The pointer to the raw __tgt_device_image.
|
||
|
const __tgt_device_image *TgtImage;
|
||
|
const __tgt_device_image *TgtImageBitcode;
|
||
|
|
||
|
/// Reference to the device this image is loaded on.
|
||
|
GenericDeviceTy &Device;
|
||
|
|
||
|
/// If this image has any global destructors that much be called.
|
||
|
/// FIXME: This is only required because we currently have no invariants
|
||
|
/// towards the lifetime of the underlying image. We should either copy
|
||
|
/// the image into memory locally or erase the pointers after init.
|
||
|
bool PendingGlobalDtors;
|
||
|
|
||
|
public:
|
||
|
DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
|
||
|
const __tgt_device_image *Image)
|
||
|
: ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
|
||
|
PendingGlobalDtors(false) {
|
||
|
assert(TgtImage && "Invalid target image");
|
||
|
}
|
||
|
|
||
|
/// Get the image identifier within the device.
|
||
|
int32_t getId() const { return ImageId; }
|
||
|
|
||
|
/// Get the device that this image is loaded onto.
|
||
|
GenericDeviceTy &getDevice() const { return Device; }
|
||
|
|
||
|
/// Get the pointer to the raw __tgt_device_image.
|
||
|
const __tgt_device_image *getTgtImage() const { return TgtImage; }
|
||
|
|
||
|
void setTgtImageBitcode(const __tgt_device_image *TgtImageBitcode) {
|
||
|
this->TgtImageBitcode = TgtImageBitcode;
|
||
|
}
|
||
|
|
||
|
const __tgt_device_image *getTgtImageBitcode() const {
|
||
|
return TgtImageBitcode;
|
||
|
}
|
||
|
|
||
|
/// Get the image starting address.
|
||
|
void *getStart() const { return TgtImage->ImageStart; }
|
||
|
|
||
|
/// Get the image size.
|
||
|
size_t getSize() const {
|
||
|
return getPtrDiff(TgtImage->ImageEnd, TgtImage->ImageStart);
|
||
|
}
|
||
|
|
||
|
/// Get a memory buffer reference to the whole image.
|
||
|
MemoryBufferRef getMemoryBuffer() const {
|
||
|
return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
|
||
|
"Image");
|
||
|
}
|
||
|
/// Accessors to the boolean value
|
||
|
bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
|
||
|
bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
|
||
|
};
|
||
|
|
||
|
/// Class implementing common functionalities of offload kernels. Each plugin
|
||
|
/// should define the specific kernel class, derive from this generic one, and
|
||
|
/// implement the necessary virtual function members.
|
||
|
struct GenericKernelTy {
|
||
|
/// Construct a kernel with a name and a execution mode.
|
||
|
GenericKernelTy(const char *Name)
|
||
|
: Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
|
||
|
|
||
|
virtual ~GenericKernelTy() {}
|
||
|
|
||
|
/// Initialize the kernel object from a specific device.
|
||
|
Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
|
||
|
virtual Error initImpl(GenericDeviceTy &GenericDevice,
|
||
|
DeviceImageTy &Image) = 0;
|
||
|
|
||
|
/// Launch the kernel on the specific device. The device must be the same
|
||
|
/// one used to initialize the kernel.
|
||
|
Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||
|
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
||
|
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
|
||
|
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
|
||
|
void *Args,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
|
||
|
|
||
|
/// Get the kernel name.
|
||
|
const char *getName() const { return Name; }
|
||
|
|
||
|
/// Return true if this kernel is a constructor or destructor.
|
||
|
bool isCtorOrDtor() const {
|
||
|
// TODO: This is not a great solution and should be revisited.
|
||
|
return StringRef(Name).ends_with("tor");
|
||
|
}
|
||
|
|
||
|
/// Get the kernel image.
|
||
|
DeviceImageTy &getImage() const {
|
||
|
assert(ImagePtr && "Kernel is not initialized!");
|
||
|
return *ImagePtr;
|
||
|
}
|
||
|
|
||
|
/// Return the kernel environment object for kernel \p Name.
|
||
|
const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
|
||
|
return KernelEnvironment;
|
||
|
}
|
||
|
|
||
|
/// Return a device pointer to a new kernel launch environment.
|
||
|
Expected<KernelLaunchEnvironmentTy *>
|
||
|
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
|
||
|
AsyncInfoWrapperTy &AsyncInfo) const;
|
||
|
|
||
|
/// Indicate whether an execution mode is valid.
|
||
|
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
|
||
|
switch (ExecutionMode) {
|
||
|
case OMP_TGT_EXEC_MODE_SPMD:
|
||
|
case OMP_TGT_EXEC_MODE_GENERIC:
|
||
|
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
/// Get the execution mode name of the kernel.
|
||
|
const char *getExecutionModeName() const {
|
||
|
switch (KernelEnvironment.Configuration.ExecMode) {
|
||
|
case OMP_TGT_EXEC_MODE_SPMD:
|
||
|
return "SPMD";
|
||
|
case OMP_TGT_EXEC_MODE_GENERIC:
|
||
|
return "Generic";
|
||
|
case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
|
||
|
return "Generic-SPMD";
|
||
|
}
|
||
|
llvm_unreachable("Unknown execution mode!");
|
||
|
}
|
||
|
|
||
|
/// Prints generic kernel launch information.
|
||
|
Error printLaunchInfo(GenericDeviceTy &GenericDevice,
|
||
|
KernelArgsTy &KernelArgs, uint32_t NumThreads,
|
||
|
uint64_t NumBlocks) const;
|
||
|
|
||
|
/// Prints plugin-specific kernel launch information after generic kernel
|
||
|
/// launch information
|
||
|
virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
||
|
KernelArgsTy &KernelArgs,
|
||
|
uint32_t NumThreads,
|
||
|
uint64_t NumBlocks) const;
|
||
|
|
||
|
private:
|
||
|
/// Prepare the arguments before launching the kernel.
|
||
|
void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||
|
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
|
||
|
llvm::SmallVectorImpl<void *> &Args,
|
||
|
llvm::SmallVectorImpl<void *> &Ptrs,
|
||
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
|
||
|
|
||
|
/// Get the number of threads and blocks for the kernel based on the
|
||
|
/// user-defined threads and block clauses.
|
||
|
uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
|
||
|
uint32_t ThreadLimitClause[3]) const;
|
||
|
|
||
|
/// The number of threads \p NumThreads can be adjusted by this method.
|
||
|
/// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
|
||
|
/// thread_limit clause.
|
||
|
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
|
||
|
uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
|
||
|
uint32_t &NumThreads, bool IsNumThreadsFromUser) const;
|
||
|
|
||
|
/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
|
||
|
bool isGenericSPMDMode() const {
|
||
|
return KernelEnvironment.Configuration.ExecMode ==
|
||
|
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
|
||
|
}
|
||
|
bool isGenericMode() const {
|
||
|
return KernelEnvironment.Configuration.ExecMode ==
|
||
|
OMP_TGT_EXEC_MODE_GENERIC;
|
||
|
}
|
||
|
bool isSPMDMode() const {
|
||
|
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
|
||
|
}
|
||
|
|
||
|
/// The kernel name.
|
||
|
const char *Name;
|
||
|
|
||
|
/// The image that contains this kernel.
|
||
|
DeviceImageTy *ImagePtr = nullptr;
|
||
|
|
||
|
protected:
|
||
|
/// The preferred number of threads to run the kernel.
|
||
|
uint32_t PreferredNumThreads;
|
||
|
|
||
|
/// The maximum number of threads which the kernel could leverage.
|
||
|
uint32_t MaxNumThreads;
|
||
|
|
||
|
/// The kernel environment, including execution flags.
|
||
|
KernelEnvironmentTy KernelEnvironment;
|
||
|
|
||
|
/// The prototype kernel launch environment.
|
||
|
KernelLaunchEnvironmentTy KernelLaunchEnvironment;
|
||
|
|
||
|
/// If the kernel is a bare kernel.
|
||
|
bool IsBareKernel = false;
|
||
|
};
|
||
|
|
||
|
/// Class representing a map of host pinned allocations. We track these pinned
|
||
|
/// allocations, so memory tranfers invloving these buffers can be optimized.
|
||
|
class PinnedAllocationMapTy {
|
||
|
|
||
|
/// Struct representing a map entry.
|
||
|
struct EntryTy {
|
||
|
/// The host pointer of the pinned allocation.
|
||
|
void *HstPtr;
|
||
|
|
||
|
/// The pointer that devices' driver should use to transfer data from/to the
|
||
|
/// pinned allocation. In most plugins, this pointer will be the same as the
|
||
|
/// host pointer above.
|
||
|
void *DevAccessiblePtr;
|
||
|
|
||
|
/// The size of the pinned allocation.
|
||
|
size_t Size;
|
||
|
|
||
|
/// Indicate whether the allocation was locked from outside the plugin, for
|
||
|
/// instance, from the application. The externally locked allocations are
|
||
|
/// not unlocked by the plugin when unregistering the last user.
|
||
|
bool ExternallyLocked;
|
||
|
|
||
|
/// The number of references to the pinned allocation. The allocation should
|
||
|
/// remain pinned and registered to the map until the number of references
|
||
|
/// becomes zero.
|
||
|
mutable size_t References;
|
||
|
|
||
|
/// Create an entry with the host and device acessible pointers, the buffer
|
||
|
/// size, and a boolean indicating whether the buffer was locked externally.
|
||
|
EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
|
||
|
bool ExternallyLocked)
|
||
|
: HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
|
||
|
ExternallyLocked(ExternallyLocked), References(1) {}
|
||
|
|
||
|
/// Utility constructor used for std::set searches.
|
||
|
EntryTy(void *HstPtr)
|
||
|
: HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0),
|
||
|
ExternallyLocked(false), References(0) {}
|
||
|
};
|
||
|
|
||
|
/// Comparator of mep entries. Use the host pointer to enforce an order
|
||
|
/// between entries.
|
||
|
struct EntryCmpTy {
|
||
|
bool operator()(const EntryTy &Left, const EntryTy &Right) const {
|
||
|
return Left.HstPtr < Right.HstPtr;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;
|
||
|
|
||
|
/// The map of host pinned allocations.
|
||
|
PinnedAllocSetTy Allocs;
|
||
|
|
||
|
/// The mutex to protect accesses to the map.
|
||
|
mutable std::shared_mutex Mutex;
|
||
|
|
||
|
/// Reference to the corresponding device.
|
||
|
GenericDeviceTy &Device;
|
||
|
|
||
|
/// Indicate whether mapped host buffers should be locked automatically.
|
||
|
bool LockMappedBuffers;
|
||
|
|
||
|
/// Indicate whether failures when locking mapped buffers should be ingored.
|
||
|
bool IgnoreLockMappedFailures;
|
||
|
|
||
|
/// Find an allocation that intersects with \p HstPtr pointer. Assume the
|
||
|
/// map's mutex is acquired.
|
||
|
const EntryTy *findIntersecting(const void *HstPtr) const {
|
||
|
if (Allocs.empty())
|
||
|
return nullptr;
|
||
|
|
||
|
// Search the first allocation with starting address that is not less than
|
||
|
// the buffer address.
|
||
|
auto It = Allocs.lower_bound({const_cast<void *>(HstPtr)});
|
||
|
|
||
|
// Direct match of starting addresses.
|
||
|
if (It != Allocs.end() && It->HstPtr == HstPtr)
|
||
|
return &(*It);
|
||
|
|
||
|
// Not direct match but may be a previous pinned allocation in the map which
|
||
|
// contains the buffer. Return false if there is no such a previous
|
||
|
// allocation.
|
||
|
if (It == Allocs.begin())
|
||
|
return nullptr;
|
||
|
|
||
|
// Move to the previous pinned allocation.
|
||
|
--It;
|
||
|
|
||
|
// The buffer is not contained in the pinned allocation.
|
||
|
if (advanceVoidPtr(It->HstPtr, It->Size) > HstPtr)
|
||
|
return &(*It);
|
||
|
|
||
|
// None found.
|
||
|
return nullptr;
|
||
|
}
|
||
|
|
||
|
/// Insert an entry to the map representing a locked buffer. The number of
|
||
|
/// references is set to one.
|
||
|
Error insertEntry(void *HstPtr, void *DevAccessiblePtr, size_t Size,
|
||
|
bool ExternallyLocked = false);
|
||
|
|
||
|
/// Erase an existing entry from the map.
|
||
|
Error eraseEntry(const EntryTy &Entry);
|
||
|
|
||
|
/// Register a new user into an entry that represents a locked buffer. Check
|
||
|
/// also that the registered buffer with \p HstPtr address and \p Size is
|
||
|
/// actually contained into the entry.
|
||
|
Error registerEntryUse(const EntryTy &Entry, void *HstPtr, size_t Size);
|
||
|
|
||
|
/// Unregister a user from the entry and return whether it is the last user.
|
||
|
/// If it is the last user, the entry will have to be removed from the map
|
||
|
/// and unlock the entry's host buffer (if necessary).
|
||
|
Expected<bool> unregisterEntryUse(const EntryTy &Entry);
|
||
|
|
||
|
/// Indicate whether the first range A fully contains the second range B.
|
||
|
static bool contains(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
|
||
|
void *EndA = advanceVoidPtr(PtrA, SizeA);
|
||
|
void *EndB = advanceVoidPtr(PtrB, SizeB);
|
||
|
return (PtrB >= PtrA && EndB <= EndA);
|
||
|
}
|
||
|
|
||
|
/// Indicate whether the first range A intersects with the second range B.
|
||
|
static bool intersects(void *PtrA, size_t SizeA, void *PtrB, size_t SizeB) {
|
||
|
void *EndA = advanceVoidPtr(PtrA, SizeA);
|
||
|
void *EndB = advanceVoidPtr(PtrB, SizeB);
|
||
|
return (PtrA < EndB && PtrB < EndA);
|
||
|
}
|
||
|
|
||
|
public:
|
||
|
/// Create the map of pinned allocations corresponding to a specific device.
|
||
|
PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {
|
||
|
|
||
|
// Envar that indicates whether mapped host buffers should be locked
|
||
|
// automatically. The possible values are boolean (on/off) and a special:
|
||
|
// off: Mapped host buffers are not locked.
|
||
|
// on: Mapped host buffers are locked in a best-effort approach.
|
||
|
// Failure to lock the buffers are silent.
|
||
|
// mandatory: Mapped host buffers are always locked and failures to lock
|
||
|
// a buffer results in a fatal error.
|
||
|
StringEnvar OMPX_LockMappedBuffers("LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS",
|
||
|
"off");
|
||
|
|
||
|
bool Enabled;
|
||
|
if (StringParser::parse(OMPX_LockMappedBuffers.get().data(), Enabled)) {
|
||
|
// Parsed as a boolean value. Enable the feature if necessary.
|
||
|
LockMappedBuffers = Enabled;
|
||
|
IgnoreLockMappedFailures = true;
|
||
|
} else if (OMPX_LockMappedBuffers.get() == "mandatory") {
|
||
|
// Enable the feature and failures are fatal.
|
||
|
LockMappedBuffers = true;
|
||
|
IgnoreLockMappedFailures = false;
|
||
|
} else {
|
||
|
// Disable by default.
|
||
|
DP("Invalid value LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS=%s\n",
|
||
|
OMPX_LockMappedBuffers.get().data());
|
||
|
LockMappedBuffers = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Register a buffer that was recently allocated as a locked host buffer.
|
||
|
/// None of the already registered pinned allocations should intersect with
|
||
|
/// this new one. The registration requires the host pointer in \p HstPtr,
|
||
|
/// the device accessible pointer in \p DevAccessiblePtr, and the size of the
|
||
|
/// allocation in \p Size. The allocation must be unregistered using the
|
||
|
/// unregisterHostBuffer function.
|
||
|
Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
|
||
|
|
||
|
/// Unregister a host pinned allocation passing the host pointer which was
|
||
|
/// previously registered using the registerHostBuffer function. When calling
|
||
|
/// this function, the pinned allocation cannot have any other user and will
|
||
|
/// not be unlocked by this function.
|
||
|
Error unregisterHostBuffer(void *HstPtr);
|
||
|
|
||
|
/// Lock the host buffer at \p HstPtr or register a new user if it intersects
|
||
|
/// with an already existing one. A partial overlapping with extension is not
|
||
|
/// allowed. The function returns the device accessible pointer of the pinned
|
||
|
/// buffer. The buffer must be unlocked using the unlockHostBuffer function.
|
||
|
Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
|
||
|
|
||
|
/// Unlock the host buffer at \p HstPtr or unregister a user if other users
|
||
|
/// are still using the pinned allocation. If this was the last user, the
|
||
|
/// pinned allocation is removed from the map and the memory is unlocked.
|
||
|
Error unlockHostBuffer(void *HstPtr);
|
||
|
|
||
|
/// Lock or register a host buffer that was recently mapped by libomptarget.
|
||
|
/// This behavior is applied if LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS is
|
||
|
/// enabled. Even if not enabled, externally locked buffers are registered
|
||
|
/// in order to optimize their transfers.
|
||
|
Error lockMappedHostBuffer(void *HstPtr, size_t Size);
|
||
|
|
||
|
/// Unlock or unregister a host buffer that was unmapped by libomptarget.
|
||
|
Error unlockUnmappedHostBuffer(void *HstPtr);
|
||
|
|
||
|
/// Return the device accessible pointer associated to the host pinned
|
||
|
/// allocation which the \p HstPtr belongs, if any. Return null in case the
|
||
|
/// \p HstPtr does not belong to any host pinned allocation. The device
|
||
|
/// accessible pointer is the one that devices should use for data transfers
|
||
|
/// that involve a host pinned buffer.
|
||
|
void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
|
||
|
std::shared_lock<std::shared_mutex> Lock(Mutex);
|
||
|
|
||
|
// Find the intersecting allocation if any.
|
||
|
const EntryTy *Entry = findIntersecting(HstPtr);
|
||
|
if (!Entry)
|
||
|
return nullptr;
|
||
|
|
||
|
return advanceVoidPtr(Entry->DevAccessiblePtr,
|
||
|
getPtrDiff(HstPtr, Entry->HstPtr));
|
||
|
}
|
||
|
|
||
|
/// Check whether a buffer belongs to a registered host pinned allocation.
|
||
|
bool isHostPinnedBuffer(const void *HstPtr) const {
|
||
|
std::shared_lock<std::shared_mutex> Lock(Mutex);
|
||
|
|
||
|
// Return whether there is an intersecting allocation.
|
||
|
return (findIntersecting(const_cast<void *>(HstPtr)) != nullptr);
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/// Class implementing common functionalities of offload devices. Each plugin
|
||
|
/// should define the specific device class, derive from this generic one, and
|
||
|
/// implement the necessary virtual function members.
|
||
|
struct GenericDeviceTy : public DeviceAllocatorTy {
|
||
|
/// Construct a device with its device id within the plugin, the number of
|
||
|
/// devices in the plugin and the grid values for that kind of device.
|
||
|
GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
|
||
|
const llvm::omp::GV &GridValues);
|
||
|
|
||
|
/// Get the device identifier within the corresponding plugin. Notice that
|
||
|
/// this id is not unique between different plugins; they may overlap.
|
||
|
int32_t getDeviceId() const { return DeviceId; }
|
||
|
|
||
|
/// Set the context of the device if needed, before calling device-specific
|
||
|
/// functions. Plugins may implement this function as a no-op if not needed.
|
||
|
virtual Error setContext() = 0;
|
||
|
|
||
|
/// Initialize the device. After this call, the device should be already
|
||
|
/// working and ready to accept queries or modifications.
|
||
|
Error init(GenericPluginTy &Plugin);
|
||
|
virtual Error initImpl(GenericPluginTy &Plugin) = 0;
|
||
|
|
||
|
/// Deinitialize the device and free all its resources. After this call, the
|
||
|
/// device is no longer considered ready, so no queries or modifications are
|
||
|
/// allowed.
|
||
|
Error deinit(GenericPluginTy &Plugin);
|
||
|
virtual Error deinitImpl() = 0;
|
||
|
|
||
|
/// Load the binary image into the device and return the target table.
|
||
|
Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
|
||
|
const __tgt_device_image *TgtImage);
|
||
|
virtual Expected<DeviceImageTy *>
|
||
|
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
|
||
|
|
||
|
/// Setup the device environment if needed. Notice this setup may not be run
|
||
|
/// on some plugins. By default, it will be executed, but plugins can change
|
||
|
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
|
||
|
Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
|
||
|
|
||
|
/// Setup the global device memory pool, if the plugin requires one.
|
||
|
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
|
||
|
uint64_t PoolSize);
|
||
|
|
||
|
// Setup the RPC server for this device if needed. This may not run on some
|
||
|
// plugins like the CPU targets. By default, it will not be executed so it is
|
||
|
// up to the target to override this using the shouldSetupRPCServer function.
|
||
|
Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
|
||
|
|
||
|
/// Synchronize the current thread with the pending operations on the
|
||
|
/// __tgt_async_info structure.
|
||
|
Error synchronize(__tgt_async_info *AsyncInfo);
|
||
|
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
|
||
|
|
||
|
/// Invokes any global constructors on the device if present and is required
|
||
|
/// by the target.
|
||
|
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
|
||
|
DeviceImageTy &Image) {
|
||
|
return Error::success();
|
||
|
}
|
||
|
|
||
|
/// Invokes any global destructors on the device if present and is required
|
||
|
/// by the target.
|
||
|
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
|
||
|
DeviceImageTy &Image) {
|
||
|
return Error::success();
|
||
|
}
|
||
|
|
||
|
/// Query for the completion of the pending operations on the __tgt_async_info
|
||
|
/// structure in a non-blocking manner.
|
||
|
Error queryAsync(__tgt_async_info *AsyncInfo);
|
||
|
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
|
||
|
|
||
|
/// Check whether the architecture supports VA management
|
||
|
virtual bool supportVAManagement() const { return false; }
|
||
|
|
||
|
/// Get the total device memory size
|
||
|
virtual Error getDeviceMemorySize(uint64_t &DSize);
|
||
|
|
||
|
/// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
|
||
|
/// map it to \p VAddr. The obtained address is stored in \p Addr. At return
|
||
|
/// \p RSize contains the actual size which can be equal or larger than the
|
||
|
/// requested size.
|
||
|
virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
|
||
|
|
||
|
/// De-allocates device memory and unmaps the virtual address \p VAddr
|
||
|
virtual Error memoryVAUnMap(void *VAddr, size_t Size);
|
||
|
|
||
|
/// Allocate data on the device or involving the device.
|
||
|
Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
|
||
|
|
||
|
/// Deallocate data from the device or involving the device.
|
||
|
Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
|
||
|
|
||
|
/// Pin host memory to optimize transfers and return the device accessible
|
||
|
/// pointer that devices should use for memory transfers involving the host
|
||
|
/// pinned allocation.
|
||
|
Expected<void *> dataLock(void *HstPtr, int64_t Size) {
|
||
|
return PinnedAllocs.lockHostBuffer(HstPtr, Size);
|
||
|
}
|
||
|
|
||
|
/// Unpin a host memory buffer that was previously pinned.
|
||
|
Error dataUnlock(void *HstPtr) {
|
||
|
return PinnedAllocs.unlockHostBuffer(HstPtr);
|
||
|
}
|
||
|
|
||
|
/// Lock the host buffer \p HstPtr with \p Size bytes with the vendor-specific
|
||
|
/// API and return the device accessible pointer.
|
||
|
virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
|
||
|
|
||
|
/// Unlock a previously locked host buffer starting at \p HstPtr.
|
||
|
virtual Error dataUnlockImpl(void *HstPtr) = 0;
|
||
|
|
||
|
/// Mark the host buffer with address \p HstPtr and \p Size bytes as a mapped
|
||
|
/// buffer. This means that libomptarget created a new mapping of that host
|
||
|
/// buffer (e.g., because a user OpenMP target map) and the buffer may be used
|
||
|
/// as source/destination of memory transfers. We can use this information to
|
||
|
/// lock the host buffer and optimize its memory transfers.
|
||
|
Error notifyDataMapped(void *HstPtr, int64_t Size) {
|
||
|
return PinnedAllocs.lockMappedHostBuffer(HstPtr, Size);
|
||
|
}
|
||
|
|
||
|
/// Mark the host buffer with address \p HstPtr as unmapped. This means that
|
||
|
/// libomptarget removed an existing mapping. If the plugin locked the buffer
|
||
|
/// in notifyDataMapped, this function should unlock it.
|
||
|
Error notifyDataUnmapped(void *HstPtr) {
|
||
|
return PinnedAllocs.unlockUnmappedHostBuffer(HstPtr);
|
||
|
}
|
||
|
|
||
|
/// Check whether the host buffer with address \p HstPtr is pinned by the
|
||
|
/// underlying vendor-specific runtime (if any). Retrieve the host pointer,
|
||
|
/// the device accessible pointer and the size of the original pinned buffer.
|
||
|
virtual Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
|
||
|
void *&BaseDevAccessiblePtr,
|
||
|
size_t &BaseSize) const = 0;
|
||
|
|
||
|
/// Submit data to the device (host to device transfer).
|
||
|
Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
|
||
|
__tgt_async_info *AsyncInfo);
|
||
|
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
||
|
|
||
|
/// Retrieve data from the device (device to host transfer).
|
||
|
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
|
||
|
__tgt_async_info *AsyncInfo);
|
||
|
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
||
|
|
||
|
/// Exchange data between devices (device to device transfer). Calling this
|
||
|
/// function is only valid if GenericPlugin::isDataExchangable() passing the
|
||
|
/// two devices returns true.
|
||
|
Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
|
||
|
int64_t Size, __tgt_async_info *AsyncInfo);
|
||
|
virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
|
||
|
void *DstPtr, int64_t Size,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
||
|
|
||
|
/// Run the kernel associated with \p EntryPtr
|
||
|
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
|
||
|
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
|
||
|
|
||
|
/// Initialize a __tgt_async_info structure. Related to interop features.
|
||
|
Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
|
||
|
virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
||
|
|
||
|
/// Initialize a __tgt_device_info structure. Related to interop features.
|
||
|
Error initDeviceInfo(__tgt_device_info *DeviceInfo);
|
||
|
virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
|
||
|
|
||
|
/// Create an event.
|
||
|
Error createEvent(void **EventPtrStorage);
|
||
|
virtual Error createEventImpl(void **EventPtrStorage) = 0;
|
||
|
|
||
|
/// Destroy an event.
|
||
|
Error destroyEvent(void *Event);
|
||
|
virtual Error destroyEventImpl(void *EventPtr) = 0;
|
||
|
|
||
|
/// Start the recording of the event.
|
||
|
Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
|
||
|
virtual Error recordEventImpl(void *EventPtr,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
||
|
|
||
|
/// Wait for an event to finish. Notice this wait is asynchronous if the
|
||
|
/// __tgt_async_info is not nullptr.
|
||
|
Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
|
||
|
virtual Error waitEventImpl(void *EventPtr,
|
||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
|
||
|
|
||
|
/// Synchronize the current thread with the event.
|
||
|
Error syncEvent(void *EventPtr);
|
||
|
virtual Error syncEventImpl(void *EventPtr) = 0;
|
||
|
|
||
|
/// Print information about the device.
|
||
|
Error printInfo();
|
||
|
virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
|
||
|
|
||
|
/// Getters of the grid values.
|
||
|
uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
|
||
|
uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
|
||
|
uint32_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
|
||
|
uint32_t getDefaultNumThreads() const {
|
||
|
return GridValues.GV_Default_WG_Size;
|
||
|
}
|
||
|
uint32_t getDefaultNumBlocks() const {
|
||
|
return GridValues.GV_Default_Num_Teams;
|
||
|
}
|
||
|
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
|
||
|
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
|
||
|
|
||
|
/// Get target compute unit kind (e.g., sm_80, or gfx908).
|
||
|
virtual std::string getComputeUnitKind() const { return "unknown"; }
|
||
|
|
||
|
/// Post processing after jit backend. The ownership of \p MB will be taken.
|
||
|
virtual Expected<std::unique_ptr<MemoryBuffer>>
|
||
|
doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const {
|
||
|
return std::move(MB);
|
||
|
}
|
||
|
|
||
|
/// The minimum number of threads we use for a low-trip count combined loop.
|
||
|
/// Instead of using more threads we increase the outer (block/team)
|
||
|
/// parallelism.
|
||
|
/// @see OMPX_MinThreadsForLowTripCount
|
||
|
virtual uint32_t getMinThreadsForLowTripCountLoop() {
|
||
|
return OMPX_MinThreadsForLowTripCount;
|
||
|
}
|
||
|
|
||
|
/// Get the total amount of hardware parallelism supported by the target
|
||
|
/// device. This is the total amount of warps or wavefronts that can be
|
||
|
/// resident on the device simultaneously.
|
||
|
virtual uint64_t getHardwareParallelism() const { return 0; }
|
||
|
|
||
|
/// Get the RPC server running on this device.
|
||
|
RPCServerTy *getRPCServer() const { return RPCServer; }
|
||
|
|
||
|
/// The number of parallel RPC ports to use on the device. In general, this
|
||
|
/// should be roughly equivalent to the amount of hardware parallelism the
|
||
|
/// device can support. This is because GPUs in general do not have forward
|
||
|
/// progress guarantees, so we minimize thread level dependencies by
|
||
|
/// allocating enough space such that each device thread can have a port. This
|
||
|
/// is likely overly pessimistic in the average case, but guarantees no
|
||
|
/// deadlocks at the cost of memory. This must be overloaded by targets
|
||
|
/// expecting to use the RPC server.
|
||
|
virtual uint64_t requestedRPCPortCount() const {
|
||
|
assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
virtual Error getDeviceStackSize(uint64_t &V) = 0;
|
||
|
|
||
|
/// Returns true if current plugin architecture is an APU
|
||
|
/// and unified_shared_memory was not requested by the program.
|
||
|
bool useAutoZeroCopy();
|
||
|
virtual bool useAutoZeroCopyImpl() { return false; }
|
||
|
|
||
|
/// Allocate and construct a kernel object.
|
||
|
virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
|
||
|
|
||
|
private:
|
||
|
/// Get and set the stack size and heap size for the device. If not used, the
|
||
|
/// plugin can implement the setters as no-op and setting the output
|
||
|
/// value to zero for the getters.
|
||
|
virtual Error setDeviceStackSize(uint64_t V) = 0;
|
||
|
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
|
||
|
virtual Error setDeviceHeapSize(uint64_t V) = 0;
|
||
|
|
||
|
/// Indicate whether the device should setup the device environment. Notice
|
||
|
/// that returning false in this function will change the behavior of the
|
||
|
/// setupDeviceEnvironment() function.
|
||
|
virtual bool shouldSetupDeviceEnvironment() const { return true; }
|
||
|
|
||
|
/// Indicate whether the device should setup the global device memory pool. If
|
||
|
/// false is return the value on the device will be uninitialized.
|
||
|
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
|
||
|
|
||
|
/// Indicate whether or not the device should setup the RPC server. This is
|
||
|
/// only necessary for unhosted targets like the GPU.
|
||
|
virtual bool shouldSetupRPCServer() const { return false; }
|
||
|
|
||
|
/// Pointer to the memory manager or nullptr if not available.
|
||
|
MemoryManagerTy *MemoryManager;
|
||
|
|
||
|
/// Environment variables defined by the OpenMP standard.
|
||
|
Int32Envar OMP_TeamLimit;
|
||
|
Int32Envar OMP_NumTeams;
|
||
|
Int32Envar OMP_TeamsThreadLimit;
|
||
|
|
||
|
/// Environment variables defined by the LLVM OpenMP implementation.
|
||
|
Int32Envar OMPX_DebugKind;
|
||
|
UInt32Envar OMPX_SharedMemorySize;
|
||
|
UInt64Envar OMPX_TargetStackSize;
|
||
|
UInt64Envar OMPX_TargetHeapSize;
|
||
|
|
||
|
/// Environment flag to set the minimum number of threads we use for a
|
||
|
/// low-trip count combined loop. Instead of using more threads we increase
|
||
|
/// the outer (block/team) parallelism.
|
||
|
UInt32Envar OMPX_MinThreadsForLowTripCount =
|
||
|
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
|
||
|
|
||
|
protected:
|
||
|
/// Environment variables defined by the LLVM OpenMP implementation
|
||
|
/// regarding the initial number of streams and events.
|
||
|
UInt32Envar OMPX_InitialNumStreams;
|
||
|
UInt32Envar OMPX_InitialNumEvents;
|
||
|
|
||
|
/// Array of images loaded into the device. Images are automatically
|
||
|
/// deallocated by the allocator.
|
||
|
llvm::SmallVector<DeviceImageTy *> LoadedImages;
|
||
|
|
||
|
/// The identifier of the device within the plugin. Notice this is not a
|
||
|
/// global device id and is not the device id visible to the OpenMP user.
|
||
|
const int32_t DeviceId;
|
||
|
|
||
|
/// The default grid values used for this device.
|
||
|
llvm::omp::GV GridValues;
|
||
|
|
||
|
/// Enumeration used for representing the current state between two devices
|
||
|
/// two devices (both under the same plugin) for the peer access between them.
|
||
|
/// The states can be a) PENDING when the state has not been queried and needs
|
||
|
/// to be queried, b) AVAILABLE when the peer access is available to be used,
|
||
|
/// and c) UNAVAILABLE if the system does not allow it.
|
||
|
enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
|
||
|
|
||
|
/// Array of peer access states with the rest of devices. This means that if
|
||
|
/// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
|
||
|
/// the device I can access device J's memory directly. However, notice this
|
||
|
/// does not mean that device J can access device I's memory directly.
|
||
|
llvm::SmallVector<PeerAccessState> PeerAccesses;
|
||
|
std::mutex PeerAccessesLock;
|
||
|
|
||
|
/// Map of host pinned allocations used for optimize device transfers.
|
||
|
PinnedAllocationMapTy PinnedAllocs;
|
||
|
|
||
|
/// A pointer to an RPC server instance attached to this device if present.
|
||
|
/// This is used to run the RPC server during task synchronization.
|
||
|
RPCServerTy *RPCServer;
|
||
|
|
||
|
#ifdef OMPT_SUPPORT
|
||
|
/// OMPT callback functions
|
||
|
#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
|
||
|
FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
|
||
|
#undef defineOmptCallback
|
||
|
|
||
|
/// Internal representation for OMPT device (initialize & finalize)
|
||
|
std::atomic<bool> OmptInitialized;
|
||
|
#endif
|
||
|
|
||
|
private:
|
||
|
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
|
||
|
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
||
|
};
|
||
|
|
||
|
/// Class implementing common functionalities of offload plugins. Each plugin
|
||
|
/// should define the specific plugin class, derive from this generic one, and
|
||
|
/// implement the necessary virtual function members.
|
||
|
struct GenericPluginTy {
|
||
|
|
||
|
/// Construct a plugin instance.
|
||
|
GenericPluginTy(Triple::ArchType TA)
|
||
|
: RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr), JIT(TA),
|
||
|
RPCServer(nullptr) {}
|
||
|
|
||
|
virtual ~GenericPluginTy() {}
|
||
|
|
||
|
/// Initialize the plugin.
|
||
|
Error init();
|
||
|
|
||
|
/// Initialize the plugin and return the number of available devices.
|
||
|
virtual Expected<int32_t> initImpl() = 0;
|
||
|
|
||
|
/// Deinitialize the plugin and release the resources.
|
||
|
Error deinit();
|
||
|
virtual Error deinitImpl() = 0;
|
||
|
|
||
|
/// Get the reference to the device with a certain device id.
|
||
|
GenericDeviceTy &getDevice(int32_t DeviceId) {
|
||
|
assert(isValidDeviceId(DeviceId) && "Invalid device id");
|
||
|
assert(Devices[DeviceId] && "Device is unitialized");
|
||
|
|
||
|
return *Devices[DeviceId];
|
||
|
}
|
||
|
|
||
|
/// Get the number of active devices.
|
||
|
int32_t getNumDevices() const { return NumDevices; }
|
||
|
|
||
|
/// Get the plugin-specific device identifier offset.
|
||
|
int32_t getDeviceIdStartIndex() const { return DeviceIdStartIndex; }
|
||
|
|
||
|
/// Set the plugin-specific device identifier offset.
|
||
|
void setDeviceIdStartIndex(int32_t Offset) { DeviceIdStartIndex = Offset; }
|
||
|
|
||
|
/// Get the ELF code to recognize the binary image of this plugin.
|
||
|
virtual uint16_t getMagicElfBits() const = 0;
|
||
|
|
||
|
/// Get the target triple of this plugin.
|
||
|
virtual Triple::ArchType getTripleArch() const = 0;
|
||
|
|
||
|
/// Allocate a structure using the internal allocator.
|
||
|
template <typename Ty> Ty *allocate() {
|
||
|
return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
|
||
|
}
|
||
|
|
||
|
/// Get the reference to the global handler of this plugin.
|
||
|
GenericGlobalHandlerTy &getGlobalHandler() {
|
||
|
assert(GlobalHandler && "Global handler not initialized");
|
||
|
return *GlobalHandler;
|
||
|
}
|
||
|
|
||
|
/// Get the reference to the JIT used for all devices connected to this
|
||
|
/// plugin.
|
||
|
JITEngine &getJIT() { return JIT; }
|
||
|
|
||
|
/// Get a reference to the RPC server used to provide host services.
|
||
|
RPCServerTy &getRPCServer() {
|
||
|
assert(RPCServer && "RPC server not initialized");
|
||
|
return *RPCServer;
|
||
|
}
|
||
|
|
||
|
/// Get the OpenMP requires flags set for this plugin.
|
||
|
int64_t getRequiresFlags() const { return RequiresFlags; }
|
||
|
|
||
|
/// Set the OpenMP requires flags for this plugin.
|
||
|
void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; }
|
||
|
|
||
|
/// Initialize a device within the plugin.
|
||
|
Error initDevice(int32_t DeviceId);
|
||
|
|
||
|
/// Deinitialize a device within the plugin and release its resources.
|
||
|
Error deinitDevice(int32_t DeviceId);
|
||
|
|
||
|
/// Indicate whether data can be exchanged directly between two devices under
|
||
|
/// this same plugin. If this function returns true, it's safe to call the
|
||
|
/// GenericDeviceTy::exchangeData() function on the source device.
|
||
|
virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
|
||
|
return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
|
||
|
}
|
||
|
|
||
|
/// Top level interface to verify if a given ELF image can be executed on a
|
||
|
/// given target. Returns true if the \p Image is compatible with the plugin.
|
||
|
Expected<bool> checkELFImage(StringRef Image) const;
|
||
|
|
||
|
/// Indicate if an image is compatible with the plugin devices. Notice that
|
||
|
/// this function may be called before actually initializing the devices. So
|
||
|
/// we could not move this function into GenericDeviceTy.
|
||
|
virtual Expected<bool> isELFCompatible(StringRef Image) const = 0;
|
||
|
|
||
|
/// Indicate whether the plugin supports empty images.
|
||
|
virtual bool supportsEmptyImages() const { return false; }
|
||
|
|
||
|
protected:
|
||
|
/// Indicate whether a device id is valid.
|
||
|
bool isValidDeviceId(int32_t DeviceId) const {
|
||
|
return (DeviceId >= 0 && DeviceId < getNumDevices());
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
/// Number of devices available for the plugin.
|
||
|
int32_t NumDevices = 0;
|
||
|
|
||
|
/// Index offset, which when added to a DeviceId, will yield a unique
|
||
|
/// user-observable device identifier. This is especially important when
|
||
|
/// DeviceIds of multiple plugins / RTLs need to be distinguishable.
|
||
|
int32_t DeviceIdStartIndex = 0;
|
||
|
|
||
|
/// Array of pointers to the devices. Initially, they are all set to nullptr.
|
||
|
/// Once a device is initialized, the pointer is stored in the position given
|
||
|
/// by its device id. A position with nullptr means that the corresponding
|
||
|
/// device was not initialized yet.
|
||
|
llvm::SmallVector<GenericDeviceTy *> Devices;
|
||
|
|
||
|
/// OpenMP requires flags.
|
||
|
int64_t RequiresFlags;
|
||
|
|
||
|
/// Pointer to the global handler for this plugin.
|
||
|
GenericGlobalHandlerTy *GlobalHandler;
|
||
|
|
||
|
/// Internal allocator for different structures.
|
||
|
BumpPtrAllocator Allocator;
|
||
|
|
||
|
/// The JIT engine shared by all devices connected to this plugin.
|
||
|
JITEngine JIT;
|
||
|
|
||
|
/// The interface between the plugin and the GPU for host services.
|
||
|
RPCServerTy *RPCServer;
|
||
|
};
|
||
|
|
||
|
/// Class for simplifying the getter operation of the plugin. Anywhere on the
|
||
|
/// code, the current plugin can be retrieved by Plugin::get(). The class also
|
||
|
/// declares functions to create plugin-specific object instances. The check(),
|
||
|
/// createPlugin(), createDevice() and createGlobalHandler() functions should be
|
||
|
/// defined by each plugin implementation.
|
||
|
class Plugin {
|
||
|
// Reference to the plugin instance.
|
||
|
static GenericPluginTy *SpecificPlugin;
|
||
|
|
||
|
Plugin() {
|
||
|
if (auto Err = init())
|
||
|
REPORT("Failed to initialize plugin: %s\n",
|
||
|
toString(std::move(Err)).data());
|
||
|
}
|
||
|
|
||
|
~Plugin() {
|
||
|
if (auto Err = deinit())
|
||
|
REPORT("Failed to deinitialize plugin: %s\n",
|
||
|
toString(std::move(Err)).data());
|
||
|
}
|
||
|
|
||
|
Plugin(const Plugin &) = delete;
|
||
|
void operator=(const Plugin &) = delete;
|
||
|
|
||
|
/// Create and intialize the plugin instance.
|
||
|
static Error init() {
|
||
|
assert(!SpecificPlugin && "Plugin already created");
|
||
|
|
||
|
// Create the specific plugin.
|
||
|
SpecificPlugin = createPlugin();
|
||
|
assert(SpecificPlugin && "Plugin was not created");
|
||
|
|
||
|
// Initialize the plugin.
|
||
|
return SpecificPlugin->init();
|
||
|
}
|
||
|
|
||
|
// Deinitialize and destroy the plugin instance.
|
||
|
static Error deinit() {
|
||
|
assert(SpecificPlugin && "Plugin no longer valid");
|
||
|
|
||
|
for (int32_t DevNo = 0, NumDev = SpecificPlugin->getNumDevices();
|
||
|
DevNo < NumDev; ++DevNo)
|
||
|
if (auto Err = SpecificPlugin->deinitDevice(DevNo))
|
||
|
return Err;
|
||
|
|
||
|
// Deinitialize the plugin.
|
||
|
if (auto Err = SpecificPlugin->deinit())
|
||
|
return Err;
|
||
|
|
||
|
// Delete the plugin instance.
|
||
|
delete SpecificPlugin;
|
||
|
|
||
|
// Invalidate the plugin reference.
|
||
|
SpecificPlugin = nullptr;
|
||
|
|
||
|
return Plugin::success();
|
||
|
}
|
||
|
|
||
|
public:
|
||
|
/// Initialize the plugin if needed. The plugin could have been initialized by
|
||
|
/// a previous call to Plugin::get().
|
||
|
static Error initIfNeeded() {
|
||
|
// Trigger the initialization if needed.
|
||
|
get();
|
||
|
|
||
|
return Error::success();
|
||
|
}
|
||
|
|
||
|
/// Get a reference (or create if it was not created) to the plugin instance.
|
||
|
static GenericPluginTy &get() {
|
||
|
// This static variable will initialize the underlying plugin instance in
|
||
|
// case there was no previous explicit initialization. The initialization is
|
||
|
// thread safe.
|
||
|
static Plugin Plugin;
|
||
|
|
||
|
assert(SpecificPlugin && "Plugin is not active");
|
||
|
return *SpecificPlugin;
|
||
|
}
|
||
|
|
||
|
/// Get a reference to the plugin with a specific plugin-specific type.
|
||
|
template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
|
||
|
|
||
|
/// Indicate whether the plugin is active.
|
||
|
static bool isActive() { return SpecificPlugin != nullptr; }
|
||
|
|
||
|
/// Create a success error. This is the same as calling Error::success(), but
|
||
|
/// it is recommended to use this one for consistency with Plugin::error() and
|
||
|
/// Plugin::check().
|
||
|
static Error success() { return Error::success(); }
|
||
|
|
||
|
/// Create a string error.
|
||
|
template <typename... ArgsTy>
|
||
|
static Error error(const char *ErrFmt, ArgsTy... Args) {
|
||
|
return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
|
||
|
}
|
||
|
|
||
|
/// Check the plugin-specific error code and return an error or success
|
||
|
/// accordingly. In case of an error, create a string error with the error
|
||
|
/// description. The ErrFmt should follow the format:
|
||
|
/// "Error in <function name>[<optional info>]: %s"
|
||
|
/// The last format specifier "%s" is mandatory and will be used to place the
|
||
|
/// error code's description. Notice this function should be only called from
|
||
|
/// the plugin-specific code.
|
||
|
template <typename... ArgsTy>
|
||
|
static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
|
||
|
|
||
|
/// Create a plugin instance.
|
||
|
static GenericPluginTy *createPlugin();
|
||
|
|
||
|
/// Create a plugin-specific device.
|
||
|
static GenericDeviceTy *createDevice(int32_t DeviceId, int32_t NumDevices);
|
||
|
|
||
|
/// Create a plugin-specific global handler.
|
||
|
static GenericGlobalHandlerTy *createGlobalHandler();
|
||
|
};
|
||
|
|
||
|
/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
|
||
|
/// acts as a reference to a device resource, such as a stream, and requires
|
||
|
/// some basic functions to be implemented. The derived class should define an
|
||
|
/// empty constructor that creates an empty and invalid resource reference. Do
|
||
|
/// not create a new resource on the ctor, but on the create() function instead.
|
||
|
///
|
||
|
/// The derived class should also define the type HandleTy as the underlying
|
||
|
/// resource handle type. For instance, in a CUDA stream it would be:
|
||
|
/// using HandleTy = CUstream;
|
||
|
struct GenericDeviceResourceRef {
|
||
|
/// Create a new resource and stores a reference.
|
||
|
virtual Error create(GenericDeviceTy &Device) = 0;
|
||
|
|
||
|
/// Destroy and release the resources pointed by the reference.
|
||
|
virtual Error destroy(GenericDeviceTy &Device) = 0;
|
||
|
|
||
|
protected:
|
||
|
~GenericDeviceResourceRef() = default;
|
||
|
};
|
||
|
|
||
|
/// Class that implements a resource pool belonging to a device. This class
|
||
|
/// operates with references to the actual resources. These reference must
|
||
|
/// derive from the GenericDeviceResourceRef class and implement the create
|
||
|
/// and destroy virtual functions.
|
||
|
template <typename ResourceRef> class GenericDeviceResourceManagerTy {
|
||
|
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
|
||
|
using ResourceHandleTy = typename ResourceRef::HandleTy;
|
||
|
|
||
|
public:
|
||
|
/// Create an empty resource pool for a specific device.
|
||
|
GenericDeviceResourceManagerTy(GenericDeviceTy &Device)
|
||
|
: Device(Device), NextAvailable(0) {}
|
||
|
|
||
|
/// Destroy the resource pool. At this point, the deinit() function should
|
||
|
/// already have been executed so the resource pool should be empty.
|
||
|
virtual ~GenericDeviceResourceManagerTy() {
|
||
|
assert(ResourcePool.empty() && "Resource pool not empty");
|
||
|
}
|
||
|
|
||
|
/// Initialize the resource pool.
|
||
|
Error init(uint32_t InitialSize) {
|
||
|
assert(ResourcePool.empty() && "Resource pool already initialized");
|
||
|
return ResourcePoolTy::resizeResourcePool(InitialSize);
|
||
|
}
|
||
|
|
||
|
/// Deinitialize the resource pool and delete all resources. This function
|
||
|
/// must be called before the destructor.
|
||
|
virtual Error deinit() {
|
||
|
if (NextAvailable)
|
||
|
DP("Missing %d resources to be returned\n", NextAvailable);
|
||
|
|
||
|
// TODO: This prevents a bug on libomptarget to make the plugins fail. There
|
||
|
// may be some resources not returned. Do not destroy these ones.
|
||
|
if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
|
||
|
return Err;
|
||
|
|
||
|
ResourcePool.clear();
|
||
|
|
||
|
return Plugin::success();
|
||
|
}
|
||
|
|
||
|
/// Get a resource from the pool or create new ones. If the function
|
||
|
/// succeeds, the handle to the resource is saved in \p Handle.
|
||
|
virtual Error getResource(ResourceHandleTy &Handle) {
|
||
|
// Get a resource with an empty resource processor.
|
||
|
return getResourcesImpl(1, &Handle,
|
||
|
[](ResourceHandleTy) { return Plugin::success(); });
|
||
|
}
|
||
|
|
||
|
/// Get multiple resources from the pool or create new ones. If the function
|
||
|
/// succeeds, the handles to the resources are saved in \p Handles.
|
||
|
virtual Error getResources(uint32_t Num, ResourceHandleTy *Handles) {
|
||
|
// Get resources with an empty resource processor.
|
||
|
return getResourcesImpl(Num, Handles,
|
||
|
[](ResourceHandleTy) { return Plugin::success(); });
|
||
|
}
|
||
|
|
||
|
/// Return resource to the pool.
|
||
|
virtual Error returnResource(ResourceHandleTy Handle) {
|
||
|
// Return a resource with an empty resource processor.
|
||
|
return returnResourceImpl(
|
||
|
Handle, [](ResourceHandleTy) { return Plugin::success(); });
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
/// Get multiple resources from the pool or create new ones. If the function
|
||
|
/// succeeds, the handles to the resources are saved in \p Handles. Also
|
||
|
/// process each of the obtained resources with \p Processor.
|
||
|
template <typename FuncTy>
|
||
|
Error getResourcesImpl(uint32_t Num, ResourceHandleTy *Handles,
|
||
|
FuncTy Processor) {
|
||
|
const std::lock_guard<std::mutex> Lock(Mutex);
|
||
|
|
||
|
assert(NextAvailable <= ResourcePool.size() &&
|
||
|
"Resource pool is corrupted");
|
||
|
|
||
|
if (NextAvailable + Num > ResourcePool.size())
|
||
|
// Double the resource pool or resize it to provide the requested ones.
|
||
|
if (auto Err = ResourcePoolTy::resizeResourcePool(
|
||
|
std::max(NextAvailable * 2, NextAvailable + Num)))
|
||
|
return Err;
|
||
|
|
||
|
// Save the handles in the output array parameter.
|
||
|
for (uint32_t r = 0; r < Num; ++r)
|
||
|
Handles[r] = ResourcePool[NextAvailable + r];
|
||
|
|
||
|
// Process all obtained resources.
|
||
|
for (uint32_t r = 0; r < Num; ++r)
|
||
|
if (auto Err = Processor(Handles[r]))
|
||
|
return Err;
|
||
|
|
||
|
NextAvailable += Num;
|
||
|
|
||
|
return Plugin::success();
|
||
|
}
|
||
|
|
||
|
/// Return resource to the pool and process the resource with \p Processor.
|
||
|
template <typename FuncTy>
|
||
|
Error returnResourceImpl(ResourceHandleTy Handle, FuncTy Processor) {
|
||
|
const std::lock_guard<std::mutex> Lock(Mutex);
|
||
|
|
||
|
// Process the returned resource.
|
||
|
if (auto Err = Processor(Handle))
|
||
|
return Err;
|
||
|
|
||
|
assert(NextAvailable > 0 && "Resource pool is corrupted");
|
||
|
ResourcePool[--NextAvailable] = Handle;
|
||
|
|
||
|
return Plugin::success();
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
/// The resources between \p OldSize and \p NewSize need to be created or
|
||
|
/// destroyed. The mutex is locked when this function is called.
|
||
|
Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
|
||
|
assert(OldSize != NewSize && "Resizing to the same size");
|
||
|
|
||
|
if (auto Err = Device.setContext())
|
||
|
return Err;
|
||
|
|
||
|
if (OldSize < NewSize) {
|
||
|
// Create new resources.
|
||
|
for (uint32_t I = OldSize; I < NewSize; ++I) {
|
||
|
if (auto Err = ResourcePool[I].create(Device))
|
||
|
return Err;
|
||
|
}
|
||
|
} else {
|
||
|
// Destroy the obsolete resources.
|
||
|
for (uint32_t I = NewSize; I < OldSize; ++I) {
|
||
|
if (auto Err = ResourcePool[I].destroy(Device))
|
||
|
return Err;
|
||
|
}
|
||
|
}
|
||
|
return Plugin::success();
|
||
|
}
|
||
|
|
||
|
/// Increase or decrease the number of resources. This function should
|
||
|
/// be called with the mutex acquired.
|
||
|
Error resizeResourcePool(uint32_t NewSize) {
|
||
|
uint32_t OldSize = ResourcePool.size();
|
||
|
|
||
|
// Nothing to do.
|
||
|
if (OldSize == NewSize)
|
||
|
return Plugin::success();
|
||
|
|
||
|
if (OldSize < NewSize) {
|
||
|
// Increase the number of resources.
|
||
|
ResourcePool.resize(NewSize);
|
||
|
return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
|
||
|
}
|
||
|
|
||
|
// Decrease the number of resources otherwise.
|
||
|
auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
|
||
|
ResourcePool.resize(NewSize);
|
||
|
|
||
|
return Err;
|
||
|
}
|
||
|
|
||
|
/// The device to which the resources belong
|
||
|
GenericDeviceTy &Device;
|
||
|
|
||
|
/// Mutex for the resource pool.
|
||
|
std::mutex Mutex;
|
||
|
|
||
|
/// The next available resource in the pool.
|
||
|
uint32_t NextAvailable;
|
||
|
|
||
|
/// The actual resource pool.
|
||
|
std::deque<ResourceRef> ResourcePool;
|
||
|
};
|
||
|
|
||
|
/// A static check on whether or not we support RPC in libomptarget.
|
||
|
const bool libomptargetSupportsRPC();
|
||
|
|
||
|
} // namespace plugin
|
||
|
} // namespace target
|
||
|
} // namespace omp
|
||
|
} // namespace llvm
|
||
|
|
||
|
#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H
|