//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The parts of the cuda api that are presently in use by the openmp cuda plugin
//
//===----------------------------------------------------------------------===//

#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
#define DYNAMIC_CUDA_CUDA_H_INCLUDED

// size_t appears in many prototypes below and uintptr_t backs CUdeviceptr.
// The ".h" spellings are accepted by both C and C++ compilers.
#include <stddef.h>
#include <stdint.h>

// Scalar types: a device is an int, a device pointer is an integer wide enough
// to hold a pointer.
typedef int CUdevice;
typedef uintptr_t CUdeviceptr;

// Handle types: pointers to struct types that are never defined in this
// header, so they can only be passed around, not dereferenced.
typedef struct CUmod_st *CUmodule;
typedef struct CUctx_st *CUcontext;
typedef struct CUfunc_st *CUfunction;
typedef struct CUstream_st *CUstream;
typedef struct CUevent_st *CUevent;

// CUdevice value designated as "invalid".
#define CU_DEVICE_INVALID ((CUdevice)-2)

// Handle type taken by cuMemCreate / cuMemMap / cuMemRelease below.
typedef unsigned long long CUmemGenericAllocationHandle_v1;
typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;

// Selector passed as the `option` argument of cuMemGetAllocationGranularity.
typedef enum CUmemAllocationGranularity_flags_enum {
  CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
} CUmemAllocationGranularity_flags;

// Type of the `flags` member of CUmemAccessDesc.
typedef enum CUmemAccess_flags_enum {
  CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
  CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
  CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
  CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
} CUmemAccess_flags;

// Type of the `type` member of CUmemLocation.
typedef enum CUmemLocationType_enum {
  CU_MEM_LOCATION_TYPE_INVALID = 0x0,
  CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
  CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
} CUmemLocationType;

// A location: a location kind plus an integer id.
typedef struct CUmemLocation_st {
  CUmemLocationType type;
  int id;
} CUmemLocation_v1;
typedef CUmemLocation_v1 CUmemLocation;

// Pairs a location with access flags; cuMemSetAccess takes a pointer to these
// together with a count.
typedef struct CUmemAccessDesc_st {
  CUmemLocation location;
  CUmemAccess_flags flags;
} CUmemAccessDesc_v1;
typedef CUmemAccessDesc_v1 CUmemAccessDesc;

// Type of the `type` member of CUmemAllocationProp.
typedef enum CUmemAllocationType_enum {
  CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
  CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
  CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
} CUmemAllocationType;

// Type of the `requestedHandleTypes` member of CUmemAllocationProp.
typedef enum CUmemAllocationHandleType_enum {
  CU_MEM_HANDLE_TYPE_NONE = 0x0,
  CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
  CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
  CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
  CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
} CUmemAllocationHandleType;

// Allocation properties consumed (by const pointer) by cuMemCreate and
// cuMemGetAllocationGranularity. Member order and types define the layout, so
// they are kept exactly as declared.
typedef struct CUmemAllocationProp_st {
  CUmemAllocationType type;
  CUmemAllocationHandleType requestedHandleTypes;
  CUmemLocation location;
  void *win32HandleMetaData;
  struct {
    unsigned char compressionType;
    unsigned char gpuDirectRDMACapable;
    unsigned short usage;
    unsigned char reserved[4];
  } allocFlags;
} CUmemAllocationProp_v1;
typedef CUmemAllocationProp_v1 CUmemAllocationProp;

// Result code returned by every function declared in this header. Only the
// codes listed here are declared.
typedef enum cudaError_enum {
  CUDA_SUCCESS = 0,
  CUDA_ERROR_INVALID_VALUE = 1,
  CUDA_ERROR_NO_DEVICE = 100,
  CUDA_ERROR_INVALID_HANDLE = 400,
  CUDA_ERROR_NOT_READY = 600,
  CUDA_ERROR_TOO_MANY_PEERS = 711,
} CUresult;

typedef enum CUstream_flags_enum {
  CU_STREAM_DEFAULT = 0x0,
  CU_STREAM_NON_BLOCKING = 0x1,
} CUstream_flags;

// Selector for cuCtxGetLimit / cuCtxSetLimit. CU_LIMIT_MAX carries no explicit
// value, so it is one greater than the preceding enumerator.
typedef enum CUlimit_enum {
  CU_LIMIT_STACK_SIZE = 0x0,
  CU_LIMIT_PRINTF_FIFO_SIZE = 0x1,
  CU_LIMIT_MALLOC_HEAP_SIZE = 0x2,
  CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x3,
  CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x4,
  CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x5,
  CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x6,
  CU_LIMIT_MAX
} CUlimit;

// Selector for cuDeviceGetAttribute. Several enumerators intentionally share a
// value (8, 12, 27-29, 102): both spellings name the same attribute.
// CU_DEVICE_ATTRIBUTE_MAX carries no explicit value, so it is one greater than
// the preceding enumerator.
typedef enum CUdevice_attribute_enum {
  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
  CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
  CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
  CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
  CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
  CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
  CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
  CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
  CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
  CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
  CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
  CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
  CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
  CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
  CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
  CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
  CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
  CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
  CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
  CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
  CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
  CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
  CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
  CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,
  CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,
  CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,
  CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,
  CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,
  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,
  CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,
  CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,
  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100,
  CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,
  CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,
  CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,
  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,
  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,
  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,
  CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,
  CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,
  CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,
  CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,
  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,
  CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,
  CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,
  CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,
  CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,
  CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,
  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,
  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,
  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,
  CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,
  CU_DEVICE_ATTRIBUTE_MAX,
} CUdevice_attribute;

// Selector for cuFuncGetAttribute; only one attribute is declared here.
typedef enum CUfunction_attribute_enum {
  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
} CUfunction_attribute;

typedef enum CUctx_flags_enum {
  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
  CU_CTX_SCHED_MASK = 0x07,
} CUctx_flags;

typedef enum CUmemAttach_flags_enum {
  CU_MEM_ATTACH_GLOBAL = 0x1,
  CU_MEM_ATTACH_HOST = 0x2,
  CU_MEM_ATTACH_SINGLE = 0x4,
} CUmemAttach_flags;

// Note the spelling difference: the enum tag is CUcomputeMode_enum while the
// typedef name is CUcompute_mode. Value 1 is not declared here.
typedef enum CUcomputeMode_enum {
  CU_COMPUTEMODE_DEFAULT = 0,
  CU_COMPUTEMODE_PROHIBITED = 2,
  CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,
} CUcompute_mode;

typedef enum CUevent_flags_enum {
  CU_EVENT_DEFAULT = 0x0,
  CU_EVENT_BLOCKING_SYNC = 0x1,
  CU_EVENT_DISABLE_TIMING = 0x2,
  CU_EVENT_INTERPROCESS = 0x4
} CUevent_flags;

// Function declarations. Every function returns a CUresult; this header only
// declares them, the definitions live elsewhere.
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
CUresult cuDeviceGetCount(int *);
CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);

// Device info
CUresult cuDeviceGetName(char *, int, CUdevice);
CUresult cuDeviceTotalMem(size_t *, CUdevice);
CUresult cuDriverGetVersion(int *);

CUresult cuGetErrorString(CUresult, const char **);
CUresult cuInit(unsigned);
CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
                        unsigned, unsigned, unsigned, CUstream, void **,
                        void **);

CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemAllocHost(void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);

CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);

CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);

CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);

CUresult cuModuleUnload(CUmodule);
CUresult cuStreamCreate(CUstream *, unsigned);
CUresult cuStreamDestroy(CUstream);
CUresult cuStreamSynchronize(CUstream);
CUresult cuStreamQuery(CUstream);
CUresult cuCtxSetCurrent(CUcontext);
CUresult cuDevicePrimaryCtxRelease(CUdevice);
CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
                            void **);

CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
                           size_t, CUstream);

CUresult cuCtxGetLimit(size_t *, CUlimit);
CUresult cuCtxSetLimit(CUlimit, size_t);

CUresult cuEventCreate(CUevent *, unsigned int);
CUresult cuEventRecord(CUevent, CUstream);
CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
CUresult cuEventSynchronize(CUevent);
CUresult cuEventDestroy(CUevent);

CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
CUresult cuMemGetInfo(size_t *free, size_t *total);
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags);
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags);
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags);
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count);
CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                       const CUmemAllocationProp *prop,
                                       CUmemAllocationGranularity_flags option);

#endif