//===-- Loader Implementation for AMDHSA devices --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple loader to run images supporting the AMDHSA
// architecture. The file launches the '_start' kernel, which should be
// provided by the device application start code and will ultimately call the
// 'main' function.
//
//===----------------------------------------------------------------------===//

#include "Loader.h"
|
||
|
|
||
|
#if defined(__has_include)
|
||
|
#if __has_include("hsa/hsa.h")
|
||
|
#include "hsa/hsa.h"
|
||
|
#include "hsa/hsa_ext_amd.h"
|
||
|
#elif __has_include("hsa.h")
|
||
|
#include "hsa.h"
|
||
|
#include "hsa_ext_amd.h"
|
||
|
#endif
|
||
|
#else
|
||
|
#include "hsa/hsa.h"
|
||
|
#include "hsa/hsa_ext_amd.h"
|
||
|
#endif
|
||
|
|
||
|
#include <cstdio>
|
||
|
#include <cstdlib>
|
||
|
#include <cstring>
|
||
|
#include <tuple>
|
||
|
#include <utility>
|
||
|
|
||
|
// The implicit arguments of COV5 AMDGPU kernels.
struct implicit_args_t {
  uint32_t grid_size_x;
  uint32_t grid_size_y;
  uint32_t grid_size_z;
  uint16_t workgroup_size_x;
  uint16_t workgroup_size_y;
  uint16_t workgroup_size_z;
  uint8_t Unused0[46];
  uint16_t grid_dims;
  uint8_t Unused1[190];
};

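// The structure above is assumed to follow the 256-byte code object v5
// implicit argument layout; the padding arrays exist to keep 'grid_dims' at
// its documented offset. This check guards against accidental edits.
static_assert(sizeof(implicit_args_t) == 256,
              "implicit_args_t must match the 256-byte COV5 layout");
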
/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
  if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
    return;

  const char *desc;
  if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS)
    desc = "Unknown error";
  fprintf(stderr, "%s\n", desc);
  exit(EXIT_FAILURE);
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename callback_ty>
hsa_status_t iterate(func_ty func, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(l, static_cast<void *>(&cb));
}

/// Generic interface for iterating using the HSA callbacks.
template <typename elem_ty, typename func_ty, typename func_arg_ty,
          typename callback_ty>
hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) {
  auto l = [](elem_ty elem, void *data) -> hsa_status_t {
    callback_ty *unwrapped = static_cast<callback_ty *>(data);
    return (*unwrapped)(elem);
  };
  return func(func_arg, l, static_cast<void *>(&cb));
}

/// Iterate through all available agents.
template <typename callback_ty>
hsa_status_t iterate_agents(callback_ty callback) {
  return iterate<hsa_agent_t>(hsa_iterate_agents, callback);
}

/// Iterate through all available memory pools.
template <typename callback_ty>
hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) {
  return iterate<hsa_amd_memory_pool_t>(hsa_amd_agent_iterate_memory_pools,
                                        agent, cb);
}

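/// Returns the first agent whose device type matches \p flag. A GPU agent is
/// only accepted if it supports kernel dispatch packets.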
template <hsa_device_type_t flag>
hsa_status_t get_agent(hsa_agent_t *output_agent) {
  // Find the first agent with a matching device type.
  auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t {
    hsa_device_type_t type;
    hsa_status_t status =
        hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type);
    if (status != HSA_STATUS_SUCCESS)
      return status;

    if (type == flag) {
      // Ensure that a GPU agent supports kernel dispatch packets.
      if (type == HSA_DEVICE_TYPE_GPU) {
        hsa_agent_feature_t features;
        status =
            hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features);
        if (status != HSA_STATUS_SUCCESS)
          return status;
        if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH)
          *output_agent = hsa_agent;
      } else {
        *output_agent = hsa_agent;
      }
      return HSA_STATUS_INFO_BREAK;
    }
    return HSA_STATUS_SUCCESS;
  };

  return iterate_agents(cb);
}

/// Retrieve a global memory pool with a \p flag from the agent.
template <hsa_amd_memory_pool_global_flag_t flag>
hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
                                   hsa_amd_memory_pool_t *output_pool) {
  auto cb = [&](hsa_amd_memory_pool_t memory_pool) {
    uint32_t flags;
    hsa_amd_segment_t segment;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment))
      return err;
    if (auto err = hsa_amd_memory_pool_get_info(
            memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags))
      return err;

    if (segment != HSA_AMD_SEGMENT_GLOBAL)
      return HSA_STATUS_SUCCESS;

    if (flags & flag)
      *output_pool = memory_pool;

    return HSA_STATUS_SUCCESS;
  };
  return iterate_agent_memory_pools(agent, cb);
}

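/// Launches the kernel named \p kernel_name from \p executable on the device
/// queue with \p kernel_args, then services RPC requests until the kernel's
/// completion signal fires.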
template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                           hsa_amd_memory_pool_t kernargs_pool,
                           hsa_amd_memory_pool_t coarsegrained_pool,
                           hsa_queue_t *queue, const LaunchParameters &params,
                           const char *kernel_name, args_t kernel_args) {
  // Look up the kernel in the loaded executable.
  hsa_executable_symbol_t symbol;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, kernel_name, &dev_agent, &symbol))
    return err;

  // Register RPC callbacks for the malloc and free functions on HSA.
  uint32_t device_id = 0;
  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
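  // The malloc handler reads the requested size out of the first slot of the
  // RPC buffer and writes the resulting device pointer back into that slot.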
  rpc_register_callback(
      device_id, RPC_MALLOC,
      [](rpc_port_t port, void *data) {
        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
          uint64_t size = buffer->data[0];
          void *dev_ptr = nullptr;
          if (hsa_status_t err =
                  hsa_amd_memory_pool_allocate(pool, size,
                                               /*flags=*/0, &dev_ptr))
            handle_error(err);
          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
        };
        rpc_recv_and_send(port, malloc_handler, data);
      },
      &tuple);
  rpc_register_callback(
      device_id, RPC_FREE,
      [](rpc_port_t port, void *data) {
        auto free_handler = [](rpc_buffer_t *buffer, void *) {
          if (hsa_status_t err = hsa_amd_memory_pool_free(
                  reinterpret_cast<void *>(buffer->data[0])))
            handle_error(err);
        };
        rpc_recv_and_send(port, free_handler, data);
      },
      nullptr);

  // Retrieve different properties of the kernel symbol used for launch.
  uint64_t kernel;
  uint32_t args_size;
  uint32_t group_size;
  uint32_t private_size;
  bool dynamic_stack;

  std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

  for (auto &[info, value] : symbol_infos)
    if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
      return err;

  // Allocate space for the kernel arguments on the host and allow the GPU
  // agent to access it.
  void *args;
  if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
                                                      /*flags=*/0, &args))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

  // Initialize all the arguments (explicit and implicit) to zero, then set the
  // explicit arguments to the values created above.
  std::memset(args, 0, args_size);
  std::memcpy(args, &kernel_args, sizeof(args_t));

  // Initialize the necessary implicit arguments to the proper values. These
  // mirror the dispatch packet set up below.
  int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
             (params.num_blocks_z * params.num_threads_z != 1);
  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
  implicit_args->grid_dims = dims;
  implicit_args->grid_size_x = params.num_blocks_x * params.num_threads_x;
  implicit_args->grid_size_y = params.num_blocks_y * params.num_threads_y;
  implicit_args->grid_size_z = params.num_blocks_z * params.num_threads_z;
  implicit_args->workgroup_size_x = params.num_threads_x;
  implicit_args->workgroup_size_y = params.num_threads_y;
  implicit_args->workgroup_size_z = params.num_threads_z;

  // Obtain a packet from the queue.
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
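  // If the queue is full, spin until the packet processor frees a slot. The
  // queue size is a power of two, so masking the id below wraps it into the
  // ring buffer.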
  while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
    ;

  const uint32_t mask = queue->size - 1;
  hsa_kernel_dispatch_packet_t *packet =
      static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
      (packet_id & mask);

  // Set up the packet for execution on the device using the provided launch
  // parameters.
  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet->workgroup_size_x = params.num_threads_x;
  packet->workgroup_size_y = params.num_threads_y;
  packet->workgroup_size_z = params.num_threads_z;
  packet->reserved0 = 0;
  packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
  packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
  packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
  packet->private_segment_size =
      dynamic_stack ? 16 * 1024 /* 16 KiB */ : private_size;
  packet->group_segment_size = group_size;
  packet->kernel_object = kernel;
  packet->kernarg_address = args;
  packet->reserved2 = 0;
  // Create a signal to indicate when this packet has been completed.
  if (hsa_status_t err =
          hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
    handle_error(err);

  // Initialize the packet header and set the doorbell signal to begin
  // execution by the HSA runtime.
  uint16_t header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
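  // The header and setup fields are packed into a single 32-bit word and
  // written with one atomic release store so the packet processor never sees
  // a partially initialized packet.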
  uint32_t header_word = header | (setup << 16u);
  __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
  hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

  // Wait until the kernel has completed execution on the device. Periodically
  // check the RPC client for work to be performed on the server.
  while (hsa_signal_wait_scacquire(
             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
    if (rpc_status_t err = rpc_handle_server(device_id))
      handle_error(err);

  // Handle the server one more time in case the kernel exited with a pending
  // send still in flight.
  if (rpc_status_t err = rpc_handle_server(device_id))
    handle_error(err);

  // Destroy the resources acquired to launch the kernel and return.
  if (hsa_status_t err = hsa_amd_memory_pool_free(args))
    handle_error(err);
  if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
    handle_error(err);

  return HSA_STATUS_SUCCESS;
}

/// Copies data from the source agent to the destination agent. The source
/// memory must first be pinned explicitly or allocated via HSA.
static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
                               const void *src, hsa_agent_t src_agent,
                               uint64_t size) {
  // Create a memory signal to copy information between the host and device.
  hsa_signal_t memory_signal;
  if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal))
    return err;

  if (hsa_status_t err = hsa_amd_memory_async_copy(
          dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal))
    return err;

  while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
    ;

  if (hsa_status_t err = hsa_signal_destroy(memory_signal))
    return err;

  return HSA_STATUS_SUCCESS;
}

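/// Loads the image into the HSA runtime, runs the '_begin', '_start', and
/// '_end' kernels on the device, and returns the value written to the
/// program's return pointer.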
int load(int argc, char **argv, char **envp, void *image, size_t size,
         const LaunchParameters &params) {
  // Initialize the HSA runtime used to communicate with the device.
  if (hsa_status_t err = hsa_init())
    handle_error(err);

  // Register a callback when the device encounters a memory fault.
  if (hsa_status_t err = hsa_amd_register_system_event_handler(
          [](const hsa_amd_event_t *event, void *) -> hsa_status_t {
            if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT)
              return HSA_STATUS_ERROR;
            return HSA_STATUS_SUCCESS;
          },
          nullptr))
    handle_error(err);

  // Obtain a single agent for the device and host to use the HSA memory model.
  uint32_t num_devices = 1;
  uint32_t device_id = 0;
  hsa_agent_t dev_agent;
  hsa_agent_t host_agent;
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_GPU>(&dev_agent))
    handle_error(err);
  if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
    handle_error(err);

  // Load the code object's ISA information and executable data segments.
  hsa_code_object_t object;
  if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
    handle_error(err);

  hsa_executable_t executable;
  if (hsa_status_t err = hsa_executable_create_alt(
          HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "",
          &executable))
    handle_error(err);

  if (hsa_status_t err =
          hsa_executable_load_code_object(executable, dev_agent, object, ""))
    handle_error(err);

  // No modifications to the executable are allowed after this point.
  if (hsa_status_t err = hsa_executable_freeze(executable, ""))
    handle_error(err);

  // Check the validity of the loaded executable. If the agent's ISA features
  // do not match the executable's code object it will fail here.
  uint32_t result;
  if (hsa_status_t err = hsa_executable_validate(executable, &result))
    handle_error(err);
  if (result)
    handle_error(HSA_STATUS_ERROR);

  // Obtain memory pools to exchange data between the host and the device. The
  // fine-grained pool acts as pinned memory on the host for DMA transfers to
  // the device, the coarse-grained pool is for allocations directly on the
  // device, and the kernel-argument pool is for executing the kernel.
  hsa_amd_memory_pool_t kernargs_pool;
  hsa_amd_memory_pool_t finegrained_pool;
  hsa_amd_memory_pool_t coarsegrained_pool;
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT>(
              host_agent, &kernargs_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED>(
              host_agent, &finegrained_pool))
    handle_error(err);
  if (hsa_status_t err =
          get_agent_memory_pool<HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED>(
              dev_agent, &coarsegrained_pool))
    handle_error(err);

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied argv and allow the GPU agent to access it.
  auto allocator = [&](uint64_t size) -> void * {
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  void *dev_argv = copy_argument_vector(argc, argv, allocator);
  if (!dev_argv)
    handle_error("Failed to allocate device argv");

  // Allocate fine-grained memory on the host to hold the pointer array for the
  // copied environment array and allow the GPU agent to access it.
  void *dev_envp = copy_environment(envp, allocator);
  if (!dev_envp)
    handle_error("Failed to allocate device environment");

  // Allocate space for the return pointer and initialize it to zero.
  void *dev_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int),
                                       /*flags=*/0, &dev_ret))
    handle_error(err);
  hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);

  // Allocate finegrained memory for the RPC server and client to share.
  uint32_t wavefront_size = 0;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
    handle_error(err);

  // Set up the RPC server.
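  // The server is given the device's wavefront size since each RPC port is
  // presumably shared by all the lanes of a wavefront, so the server's slot
  // layout must match the device's.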
  if (rpc_status_t err = rpc_init(num_devices))
    handle_error(err);
  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
  auto rpc_alloc = [](uint64_t size, void *data) {
    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
    void *dev_ptr = nullptr;
    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
                                                        /*flags=*/0, &dev_ptr))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
    return dev_ptr;
  };
  if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT,
                                         wavefront_size, rpc_alloc, &tuple))
    handle_error(err);

  // Register callbacks for the RPC unit tests.
  if (wavefront_size == 32)
    register_rpc_callbacks<32>(device_id);
  else if (wavefront_size == 64)
    register_rpc_callbacks<64>(device_id);
  else
    handle_error("Invalid wavefront size");

  // Initialize the RPC client on the device by copying the local data to the
  // device's internal pointer.
  hsa_executable_symbol_t rpc_client_sym;
  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
    handle_error(err);

  void *rpc_client_host;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(void *),
                                       /*flags=*/0, &rpc_client_host))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_client_host);

  void *rpc_client_dev;
  if (hsa_status_t err = hsa_executable_symbol_get_info(
          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
          &rpc_client_dev))
    handle_error(err);

  // Copy the address of the client buffer from the device to the host.
  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
                                    dev_agent, sizeof(void *)))
    handle_error(err);

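  // Pin the host-side client buffer so the asynchronous copy below can use it
  // as a source.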
  void *rpc_client_buffer;
  if (hsa_status_t err = hsa_amd_memory_lock(
          const_cast<void *>(rpc_get_client_buffer(device_id)),
          rpc_get_client_size(),
          /*agents=*/nullptr, 0, &rpc_client_buffer))
    handle_error(err);

  // Copy the RPC client buffer to the address pointed to by the symbol.
  if (hsa_status_t err =
          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
                     rpc_client_buffer, host_agent, rpc_get_client_size()))
    handle_error(err);

  if (hsa_status_t err = hsa_amd_memory_unlock(
          const_cast<void *>(rpc_get_client_buffer(device_id))))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
    handle_error(err);

  // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. If
  // the clock_freq symbol is missing, there is no work to do.
  hsa_executable_symbol_t freq_sym;
  if (HSA_STATUS_SUCCESS ==
      hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
                                        &dev_agent, &freq_sym)) {

    void *host_clock_freq;
    if (hsa_status_t err =
            hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
                                         /*flags=*/0, &host_clock_freq))
      handle_error(err);
    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq);

    if (HSA_STATUS_SUCCESS ==
        hsa_agent_get_info(dev_agent,
                           static_cast<hsa_agent_info_t>(
                               HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY),
                           host_clock_freq)) {

      void *freq_addr;
      if (hsa_status_t err = hsa_executable_symbol_get_info(
              freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
              &freq_addr))
        handle_error(err);

      if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq,
                                        host_agent, sizeof(uint64_t)))
        handle_error(err);
    }
  }

  // Obtain a queue with the minimum (power of two) size, used to send commands
  // to the HSA runtime and launch execution on the device.
  uint64_t queue_size;
  if (hsa_status_t err = hsa_agent_get_info(
          dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
    handle_error(err);
  hsa_queue_t *queue = nullptr;
  if (hsa_status_t err =
          hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
                           nullptr, UINT32_MAX, UINT32_MAX, &queue))
    handle_error(err);

  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
  begin_args_t init_args = {argc, dev_argv, dev_envp};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_begin.kd", init_args))
    handle_error(err);

  start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
  if (hsa_status_t err =
          launch_kernel(dev_agent, executable, kernargs_pool,
                        coarsegrained_pool, queue, params, "_start.kd", args))
    handle_error(err);

  void *host_ret;
  if (hsa_status_t err =
          hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int),
                                       /*flags=*/0, &host_ret))
    handle_error(err);
  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret);

  if (hsa_status_t err =
          hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int)))
    handle_error(err);

  // Save the return value and perform basic clean-up.
  int ret = *static_cast<int *>(host_ret);

  end_args_t fini_args = {ret};
  if (hsa_status_t err = launch_kernel(
          dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
          single_threaded_params, "_end.kd", fini_args))
    handle_error(err);

  if (rpc_status_t err = rpc_server_shutdown(
          device_id, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
          nullptr))
    handle_error(err);

  // Free the memory allocated for the device.
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
    handle_error(err);
  if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
    handle_error(err);

  if (hsa_status_t err = hsa_queue_destroy(queue))
    handle_error(err);

  if (hsa_status_t err = hsa_executable_destroy(executable))
    handle_error(err);

  if (hsa_status_t err = hsa_code_object_destroy(object))
    handle_error(err);

  if (rpc_status_t err = rpc_shutdown())
    handle_error(err);
  if (hsa_status_t err = hsa_shut_down())
    handle_error(err);

  return ret;
}