//===- Target.cpp - MLIR LLVM NVVM target compilation -----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This files defines NVVM target related functions including registration // calls for the `#nvvm.target` compilation attribute. // //===----------------------------------------------------------------------===// #include "mlir/Target/LLVM/NVVM/Target.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Target/LLVM/NVVM/Utils.h" #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Export.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/TargetSelect.h" #include using namespace mlir; using namespace mlir::NVVM; #ifndef __DEFAULT_CUDATOOLKIT_PATH__ #define __DEFAULT_CUDATOOLKIT_PATH__ "" #endif namespace { // Implementation of the `TargetAttrInterface` model. class NVVMTargetAttrImpl : public gpu::TargetAttrInterface::FallbackModel { public: std::optional> serializeToObject(Attribute attribute, Operation *module, const gpu::TargetOptions &options) const; Attribute createObject(Attribute attribute, const SmallVector &object, const gpu::TargetOptions &options) const; }; } // namespace // Register the NVVM dialect, the NVVM translation & the target interface. void mlir::NVVM::registerNVVMTargetInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) { NVVMTargetAttr::attachInterface(*ctx); }); } void mlir::NVVM::registerNVVMTargetInterfaceExternalModels( MLIRContext &context) { DialectRegistry registry; registerNVVMTargetInterfaceExternalModels(registry); context.appendDialectRegistry(registry); } // Search for the CUDA toolkit path. StringRef mlir::NVVM::getCUDAToolkitPath() { if (const char *var = std::getenv("CUDA_ROOT")) return var; if (const char *var = std::getenv("CUDA_HOME")) return var; if (const char *var = std::getenv("CUDA_PATH")) return var; return __DEFAULT_CUDATOOLKIT_PATH__; } SerializeGPUModuleBase::SerializeGPUModuleBase( Operation &module, NVVMTargetAttr target, const gpu::TargetOptions &targetOptions) : ModuleToObject(module, target.getTriple(), target.getChip(), target.getFeatures(), target.getO()), target(target), toolkitPath(targetOptions.getToolkitPath()), fileList(targetOptions.getLinkFiles()) { // If `targetOptions` have an empty toolkitPath use `getCUDAToolkitPath` if (toolkitPath.empty()) toolkitPath = getCUDAToolkitPath(); // Append the files in the target attribute. if (ArrayAttr files = target.getLink()) for (Attribute attr : files.getValue()) if (auto file = dyn_cast(attr)) fileList.push_back(file.str()); // Append libdevice to the files to be loaded. (void)appendStandardLibs(); } void SerializeGPUModuleBase::init() { static llvm::once_flag initializeBackendOnce; llvm::call_once(initializeBackendOnce, []() { // If the `NVPTX` LLVM target was built, initialize it. #if LLVM_HAS_NVPTX_TARGET LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTargetMC(); LLVMInitializeNVPTXAsmPrinter(); #endif }); } NVVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; } StringRef SerializeGPUModuleBase::getToolkitPath() const { return toolkitPath; } ArrayRef SerializeGPUModuleBase::getFileList() const { return fileList; } // Try to append `libdevice` from a CUDA toolkit installation. LogicalResult SerializeGPUModuleBase::appendStandardLibs() { StringRef pathRef = getToolkitPath(); if (!pathRef.empty()) { SmallVector path; path.insert(path.begin(), pathRef.begin(), pathRef.end()); pathRef = StringRef(path.data(), path.size()); if (!llvm::sys::fs::is_directory(pathRef)) { getOperation().emitError() << "CUDA path: " << pathRef << " does not exist or is not a directory.\n"; return failure(); } llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc"); pathRef = StringRef(path.data(), path.size()); if (!llvm::sys::fs::is_regular_file(pathRef)) { getOperation().emitError() << "LibDevice path: " << pathRef << " does not exist or is not a file.\n"; return failure(); } fileList.push_back(pathRef.str()); } return success(); } std::optional>> SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) { SmallVector> bcFiles; if (failed(loadBitcodeFilesFromList(module.getContext(), fileList, bcFiles, true))) return std::nullopt; return std::move(bcFiles); } #if MLIR_CUDA_CONVERSIONS_ENABLED == 1 namespace { class NVPTXSerializer : public SerializeGPUModuleBase { public: NVPTXSerializer(Operation &module, NVVMTargetAttr target, const gpu::TargetOptions &targetOptions); gpu::GPUModuleOp getOperation(); // Compile PTX to cubin using `ptxas`. std::optional> compileToBinary(const std::string &ptxCode); // Compile PTX to cubin using the `nvptxcompiler` library. std::optional> compileToBinaryNVPTX(const std::string &ptxCode); std::optional> moduleToObject(llvm::Module &llvmModule) override; private: using TmpFile = std::pair, llvm::FileRemover>; // Create a temp file. std::optional createTemp(StringRef name, StringRef suffix); // Find the `tool` path, where `tool` is the name of the binary to search, // i.e. `ptxas` or `fatbinary`. The search order is: // 1. The toolkit path in `targetOptions`. // 2. In the system PATH. // 3. The path from `getCUDAToolkitPath()`. std::optional findTool(StringRef tool); // Target options. gpu::TargetOptions targetOptions; }; } // namespace NVPTXSerializer::NVPTXSerializer(Operation &module, NVVMTargetAttr target, const gpu::TargetOptions &targetOptions) : SerializeGPUModuleBase(module, target, targetOptions), targetOptions(targetOptions) {} std::optional NVPTXSerializer::createTemp(StringRef name, StringRef suffix) { llvm::SmallString<128> filename; std::error_code ec = llvm::sys::fs::createTemporaryFile(name, suffix, filename); if (ec) { getOperation().emitError() << "Couldn't create the temp file: `" << filename << "`, error message: " << ec.message(); return std::nullopt; } return TmpFile(filename, llvm::FileRemover(filename.c_str())); } gpu::GPUModuleOp NVPTXSerializer::getOperation() { return dyn_cast(&SerializeGPUModuleBase::getOperation()); } std::optional NVPTXSerializer::findTool(StringRef tool) { // Find the `tool` path. // 1. Check the toolkit path given in the command line. StringRef pathRef = targetOptions.getToolkitPath(); SmallVector path; if (pathRef.size()) { path.insert(path.begin(), pathRef.begin(), pathRef.end()); llvm::sys::path::append(path, "bin", tool); if (llvm::sys::fs::can_execute(path)) return StringRef(path.data(), path.size()).str(); } // 2. Check PATH. if (std::optional toolPath = llvm::sys::Process::FindInEnvPath("PATH", tool)) return *toolPath; // 3. Check `getCUDAToolkitPath()`. pathRef = getCUDAToolkitPath(); path.clear(); if (pathRef.size()) { path.insert(path.begin(), pathRef.begin(), pathRef.end()); llvm::sys::path::append(path, "bin", tool); if (llvm::sys::fs::can_execute(path)) return StringRef(path.data(), path.size()).str(); } getOperation().emitError() << "Couldn't find the `" << tool << "` binary. Please specify the toolkit " "path, add the compiler to $PATH, or set one of the environment " "variables in `NVVM::getCUDAToolkitPath()`."; return std::nullopt; } // TODO: clean this method & have a generic tool driver or never emit binaries // with this mechanism and let another stage take care of it. std::optional> NVPTXSerializer::compileToBinary(const std::string &ptxCode) { // Determine if the serializer should create a fatbinary with the PTX embeded // or a simple CUBIN binary. const bool createFatbin = targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin; // Find the `ptxas` & `fatbinary` tools. std::optional ptxasCompiler = findTool("ptxas"); if (!ptxasCompiler) return std::nullopt; std::optional fatbinaryTool = findTool("fatbinary"); if (createFatbin && !fatbinaryTool) return std::nullopt; Location loc = getOperation().getLoc(); // Base name for all temp files: mlir---. std::string basename = llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(), getTarget().getTriple(), getTarget().getChip()); // Create temp files: std::optional ptxFile = createTemp(basename, "ptx"); if (!ptxFile) return std::nullopt; std::optional logFile = createTemp(basename, "log"); if (!logFile) return std::nullopt; std::optional binaryFile = createTemp(basename, "bin"); if (!binaryFile) return std::nullopt; TmpFile cubinFile; if (createFatbin) { Twine cubinFilename = ptxFile->first + ".cubin"; cubinFile = TmpFile(cubinFilename.str(), llvm::FileRemover(cubinFilename)); } else { cubinFile.first = binaryFile->first; } std::error_code ec; // Dump the PTX to a temp file. { llvm::raw_fd_ostream ptxStream(ptxFile->first, ec); if (ec) { emitError(loc) << "Couldn't open the file: `" << ptxFile->first << "`, error message: " << ec.message(); return std::nullopt; } ptxStream << ptxCode; if (ptxStream.has_error()) { emitError(loc) << "An error occurred while writing the PTX to: `" << ptxFile->first << "`."; return std::nullopt; } ptxStream.flush(); } // Command redirects. std::optional redirects[] = { std::nullopt, logFile->first, logFile->first, }; // Get any extra args passed in `targetOptions`. std::pair> cmdOpts = targetOptions.tokenizeCmdOptions(); // Create ptxas args. std::string optLevel = std::to_string(this->optLevel); SmallVector ptxasArgs( {StringRef("ptxas"), StringRef("-arch"), getTarget().getChip(), StringRef(ptxFile->first), StringRef("-o"), StringRef(cubinFile.first), "--opt-level", optLevel}); bool useFatbin32 = false; for (auto cArg : cmdOpts.second) { // All `cmdOpts` are for `ptxas` except `-32` which passes `-32` to // `fatbinary`, indicating a 32-bit target. By default a 64-bit target is // assumed. if (StringRef arg(cArg); arg != "-32") ptxasArgs.push_back(arg); else useFatbin32 = true; } // Create the `fatbinary` args. StringRef chip = getTarget().getChip(); // Remove the arch prefix to obtain the compute capability. chip.consume_front("sm_"), chip.consume_front("compute_"); // Embed the cubin object. std::string cubinArg = llvm::formatv("--image3=kind=elf,sm={0},file={1}", chip, cubinFile.first) .str(); // Embed the PTX file so the driver can JIT if needed. std::string ptxArg = llvm::formatv("--image3=kind=ptx,sm={0},file={1}", chip, ptxFile->first) .str(); SmallVector fatbinArgs({StringRef("fatbinary"), useFatbin32 ? "-32" : "-64", cubinArg, ptxArg, "--create", binaryFile->first}); // Dump tool invocation commands. #define DEBUG_TYPE "serialize-to-binary" LLVM_DEBUG({ llvm::dbgs() << "Tool invocation for module: " << getOperation().getNameAttr() << "\n"; llvm::interleave(ptxasArgs, llvm::dbgs(), " "); llvm::dbgs() << "\n"; if (createFatbin) { llvm::interleave(fatbinArgs, llvm::dbgs(), " "); llvm::dbgs() << "\n"; } }); #undef DEBUG_TYPE // Helper function for printing tool error logs. std::string message; auto emitLogError = [&](StringRef toolName) -> std::optional> { if (message.empty()) { llvm::ErrorOr> toolStderr = llvm::MemoryBuffer::getFile(logFile->first); if (toolStderr) emitError(loc) << toolName << " invocation failed. Log:\n" << toolStderr->get()->getBuffer(); else emitError(loc) << toolName << " invocation failed."; return std::nullopt; } emitError(loc) << toolName << " invocation failed, error message: " << message; return std::nullopt; }; // Invoke PTXAS. if (llvm::sys::ExecuteAndWait(ptxasCompiler.value(), ptxasArgs, /*Env=*/std::nullopt, /*Redirects=*/redirects, /*SecondsToWait=*/0, /*MemoryLimit=*/0, /*ErrMsg=*/&message)) return emitLogError("`ptxas`"); // Invoke `fatbin`. message.clear(); if (createFatbin && llvm::sys::ExecuteAndWait(*fatbinaryTool, fatbinArgs, /*Env=*/std::nullopt, /*Redirects=*/redirects, /*SecondsToWait=*/0, /*MemoryLimit=*/0, /*ErrMsg=*/&message)) return emitLogError("`fatbinary`"); // Dump the output of the tools, helpful if the verbose flag was passed. #define DEBUG_TYPE "serialize-to-binary" LLVM_DEBUG({ llvm::ErrorOr> logBuffer = llvm::MemoryBuffer::getFile(logFile->first); if (logBuffer && (*logBuffer)->getBuffer().size()) { llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n"; llvm::dbgs().flush(); } }); #undef DEBUG_TYPE // Read the fatbin. llvm::ErrorOr> binaryBuffer = llvm::MemoryBuffer::getFile(binaryFile->first); if (!binaryBuffer) { emitError(loc) << "Couldn't open the file: `" << binaryFile->first << "`, error message: " << binaryBuffer.getError().message(); return std::nullopt; } StringRef fatbin = (*binaryBuffer)->getBuffer(); return SmallVector(fatbin.begin(), fatbin.end()); } #if MLIR_NVPTXCOMPILER_ENABLED == 1 #include "nvPTXCompiler.h" #define RETURN_ON_NVPTXCOMPILER_ERROR(expr) \ do { \ if (auto status = (expr)) { \ emitError(loc) << llvm::Twine(#expr).concat(" failed with error code ") \ << status; \ return std::nullopt; \ } \ } while (false) std::optional> NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) { Location loc = getOperation().getLoc(); nvPTXCompilerHandle compiler = nullptr; nvPTXCompileResult status; size_t logSize; // Create the options. std::string optLevel = std::to_string(this->optLevel); std::pair> cmdOpts = targetOptions.tokenizeCmdOptions(); cmdOpts.second.append( {"-arch", getTarget().getChip().data(), "--opt-level", optLevel.c_str()}); // Create the compiler handle. RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerCreate(&compiler, ptxCode.size(), ptxCode.c_str())); // Try to compile the binary. status = nvPTXCompilerCompile(compiler, cmdOpts.second.size(), cmdOpts.second.data()); // Check if compilation failed. if (status != NVPTXCOMPILE_SUCCESS) { RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerGetErrorLogSize(compiler, &logSize)); if (logSize != 0) { SmallVector log(logSize + 1, 0); RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerGetErrorLog(compiler, log.data())); emitError(loc) << "NVPTX compiler invocation failed, error log: " << log.data(); } else emitError(loc) << "NVPTX compiler invocation failed with error code: " << status; return std::nullopt; } // Retrieve the binary. size_t elfSize; RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerGetCompiledProgramSize(compiler, &elfSize)); SmallVector binary(elfSize, 0); RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerGetCompiledProgram(compiler, (void *)binary.data())); // Dump the log of the compiler, helpful if the verbose flag was passed. #define DEBUG_TYPE "serialize-to-binary" LLVM_DEBUG({ RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerGetInfoLogSize(compiler, &logSize)); if (logSize != 0) { SmallVector log(logSize + 1, 0); RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerGetInfoLog(compiler, log.data())); llvm::dbgs() << "NVPTX compiler invocation for module: " << getOperation().getNameAttr() << "\n"; llvm::dbgs() << "Arguments: "; llvm::interleave(cmdOpts.second, llvm::dbgs(), " "); llvm::dbgs() << "\nOutput\n" << log.data() << "\n"; llvm::dbgs().flush(); } }); #undef DEBUG_TYPE RETURN_ON_NVPTXCOMPILER_ERROR(nvPTXCompilerDestroy(&compiler)); return binary; } #endif // MLIR_NVPTXCOMPILER_ENABLED == 1 std::optional> NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) { // Return LLVM IR if the compilation target is offload. #define DEBUG_TYPE "serialize-to-llvm" LLVM_DEBUG({ llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr() << "\n"; llvm::dbgs() << llvmModule << "\n"; llvm::dbgs().flush(); }); #undef DEBUG_TYPE if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload) return SerializeGPUModuleBase::moduleToObject(llvmModule); // Emit PTX code. std::optional targetMachine = getOrCreateTargetMachine(); if (!targetMachine) { getOperation().emitError() << "Target Machine unavailable for triple " << triple << ", can't optimize with LLVM\n"; return std::nullopt; } std::optional serializedISA = translateToISA(llvmModule, **targetMachine); if (!serializedISA) { getOperation().emitError() << "Failed translating the module to ISA."; return std::nullopt; } #define DEBUG_TYPE "serialize-to-isa" LLVM_DEBUG({ llvm::dbgs() << "PTX for module: " << getOperation().getNameAttr() << "\n"; llvm::dbgs() << *serializedISA << "\n"; llvm::dbgs().flush(); }); #undef DEBUG_TYPE // Return PTX if the compilation target is assembly. if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly) { // Make sure to include the null terminator. StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); return SmallVector(bin.begin(), bin.end()); } // Compile to binary. #if MLIR_NVPTXCOMPILER_ENABLED == 1 return compileToBinaryNVPTX(*serializedISA); #else return compileToBinary(*serializedISA); #endif // MLIR_NVPTXCOMPILER_ENABLED == 1 } #endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1 std::optional> NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, const gpu::TargetOptions &options) const { assert(module && "The module must be non null."); if (!module) return std::nullopt; if (!mlir::isa(module)) { module->emitError("Module must be a GPU module."); return std::nullopt; } #if MLIR_CUDA_CONVERSIONS_ENABLED == 1 NVPTXSerializer serializer(*module, cast(attribute), options); serializer.init(); return serializer.run(); #else module->emitError( "The `NVPTX` target was not built. Please enable it when building LLVM."); return std::nullopt; #endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1 } Attribute NVVMTargetAttrImpl::createObject(Attribute attribute, const SmallVector &object, const gpu::TargetOptions &options) const { auto target = cast(attribute); gpu::CompilationTarget format = options.getCompilationTarget(); DictionaryAttr objectProps; Builder builder(attribute.getContext()); if (format == gpu::CompilationTarget::Assembly) objectProps = builder.getDictionaryAttr( {builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))}); return builder.getAttr( attribute, format, builder.getStringAttr(StringRef(object.data(), object.size())), objectProps); }