//===- LowerGPUToHSACO.cpp - Convert GPU kernel to HSACO blob -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass that serializes a gpu module into HSAco blob and
// adds that blob as a string attribute of the module.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"

#if MLIR_GPU_TO_HSACO_PASS_ENABLE
#include "mlir/ExecutionEngine/OptUtils.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/FileUtilities.h"
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Export.h"

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"

#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/WithColor.h"

#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

#include "llvm/Transforms/IPO/Internalize.h"

#include <cstdlib>
#include using namespace mlir; namespace { class SerializeToHsacoPass : public PassWrapper { static llvm::once_flag initializeBackendOnce; public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToHsacoPass) SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, int optLevel); SerializeToHsacoPass(const SerializeToHsacoPass &other); StringRef getArgument() const override { return "gpu-to-hsaco"; } StringRef getDescription() const override { return "Lower GPU kernel function to HSACO binary annotations"; } protected: Option rocmPath{*this, "rocm-path", llvm::cl::desc("Path to ROCm install")}; // Overload to allow linking in device libs std::unique_ptr translateToLLVMIR(llvm::LLVMContext &llvmContext) override; private: // Loads LLVM bitcode libraries std::optional, 3>> loadLibraries(SmallVectorImpl &path, SmallVectorImpl &libraries, llvm::LLVMContext &context); // Serializes ROCDL to HSACO. std::unique_ptr> serializeISA(const std::string &isa) override; LogicalResult assembleIsa(const std::string &isa, SmallVectorImpl &result); std::unique_ptr> createHsaco(ArrayRef isaBinary); std::string getRocmPath(); }; } // namespace SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other) : PassWrapper(other) {} /// Get a user-specified path to ROCm // Tries, in order, the --rocm-path option, the ROCM_PATH environment variable // and a compile-time default std::string SerializeToHsacoPass::getRocmPath() { if (rocmPath.getNumOccurrences() > 0) return rocmPath.getValue(); return __DEFAULT_ROCM_PATH__; } // Sets the 'option' to 'value' unless it already has a value. 
static void maybeSetOption(Pass::Option &option, function_ref getValue) { if (!option.hasValue()) option = getValue(); } llvm::once_flag SerializeToHsacoPass::initializeBackendOnce; SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, int optLevel) { // No matter how this pass is constructed, ensure that the AMDGPU backend // is initialized exactly once. llvm::call_once(initializeBackendOnce, []() { // Initialize LLVM AMDGPU backend. LLVMInitializeAMDGPUAsmParser(); LLVMInitializeAMDGPUAsmPrinter(); LLVMInitializeAMDGPUTarget(); LLVMInitializeAMDGPUTargetInfo(); LLVMInitializeAMDGPUTargetMC(); }); maybeSetOption(this->triple, [&triple] { return triple.str(); }); maybeSetOption(this->chip, [&arch] { return arch.str(); }); maybeSetOption(this->features, [&features] { return features.str(); }); if (this->optLevel.getNumOccurrences() == 0) this->optLevel.setValue(optLevel); } std::optional, 3>> SerializeToHsacoPass::loadLibraries(SmallVectorImpl &path, SmallVectorImpl &libraries, llvm::LLVMContext &context) { SmallVector, 3> ret; size_t dirLength = path.size(); if (!llvm::sys::fs::is_directory(path)) { getOperation().emitRemark() << "Bitcode path: " << path << " does not exist or is not a directory\n"; return std::nullopt; } for (const StringRef file : libraries) { llvm::SMDiagnostic error; llvm::sys::path::append(path, file); llvm::StringRef pathRef(path.data(), path.size()); std::unique_ptr library = llvm::getLazyIRFileModule(pathRef, error, context); path.truncate(dirLength); if (!library) { getOperation().emitError() << "Failed to load library " << file << " from " << path << error.getMessage(); return std::nullopt; } // Some ROCM builds don't strip this like they should if (auto *openclVersion = library->getNamedMetadata("opencl.ocl.version")) library->eraseNamedMetadata(openclVersion); // Stop spamming us with clang version numbers if (auto *ident = library->getNamedMetadata("llvm.ident")) 
library->eraseNamedMetadata(ident); ret.push_back(std::move(library)); } return std::move(ret); } std::unique_ptr SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) { // MLIR -> LLVM translation std::unique_ptr ret = gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext); if (!ret) { getOperation().emitOpError("Module lowering failed"); return ret; } // Walk the LLVM module in order to determine if we need to link in device // libs bool needOpenCl = false; bool needOckl = false; bool needOcml = false; for (llvm::Function &f : ret->functions()) { if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) { StringRef funcName = f.getName(); if ("printf" == funcName) needOpenCl = true; if (funcName.starts_with("__ockl_")) needOckl = true; if (funcName.starts_with("__ocml_")) needOcml = true; } } if (needOpenCl) needOcml = needOckl = true; // No libraries needed (the typical case) if (!(needOpenCl || needOcml || needOckl)) return ret; // Define one of the control constants the ROCm device libraries expect to be // present These constants can either be defined in the module or can be // imported by linking in bitcode that defines the constant. 
To simplify our // logic, we define the constants into the module we are compiling auto addControlConstant = [&module = *ret](StringRef name, uint32_t value, uint32_t bitwidth) { using llvm::GlobalVariable; if (module.getNamedGlobal(name)) { return; } llvm::IntegerType *type = llvm::IntegerType::getIntNTy(module.getContext(), bitwidth); auto *initializer = llvm::ConstantInt::get(type, value, /*isSigned=*/false); auto *constant = new GlobalVariable( module, type, /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage, initializer, name, /*before=*/nullptr, /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal, /*addressSpace=*/4); constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local); constant->setVisibility( GlobalVariable::VisibilityTypes::ProtectedVisibility); constant->setAlignment(llvm::MaybeAlign(bitwidth / 8)); }; // Set up control variables in the module instead of linking in tiny bitcode if (needOcml) { // TODO(kdrewnia): Enable math optimizations once we have support for // `-ffast-math`-like options addControlConstant("__oclc_finite_only_opt", 0, 8); addControlConstant("__oclc_daz_opt", 0, 8); addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8); addControlConstant("__oclc_unsafe_math_opt", 0, 8); } if (needOcml || needOckl) { addControlConstant("__oclc_wavefrontsize64", 1, 8); StringRef chipSet = this->chip.getValue(); if (chipSet.starts_with("gfx")) chipSet = chipSet.substr(3); uint32_t minor = llvm::APInt(32, chipSet.substr(chipSet.size() - 2), 16).getZExtValue(); uint32_t major = llvm::APInt(32, chipSet.substr(0, chipSet.size() - 2), 10) .getZExtValue(); uint32_t isaNumber = minor + 1000 * major; addControlConstant("__oclc_ISA_version", isaNumber, 32); // This constant must always match the default code object ABI version // of the AMDGPU backend. 
addControlConstant("__oclc_ABI_version", 500, 32); } // Determine libraries we need to link - order matters due to dependencies llvm::SmallVector libraries; if (needOpenCl) libraries.push_back("opencl.bc"); if (needOcml) libraries.push_back("ocml.bc"); if (needOckl) libraries.push_back("ockl.bc"); std::optional, 3>> mbModules; std::string theRocmPath = getRocmPath(); llvm::SmallString<32> bitcodePath(theRocmPath); llvm::sys::path::append(bitcodePath, "amdgcn", "bitcode"); mbModules = loadLibraries(bitcodePath, libraries, llvmContext); if (!mbModules) { getOperation() .emitWarning("Could not load required device libraries") .attachNote() << "This will probably cause link-time or run-time failures"; return ret; // We can still abort here } llvm::Linker linker(*ret); for (std::unique_ptr &libModule : *mbModules) { // This bitcode linking code is substantially similar to what is used in // hip-clang It imports the library functions into the module, allowing LLVM // optimization passes (which must run after linking) to optimize across the // libraries and the module's code. We also only import symbols if they are // referenced by the module or a previous library since there will be no // other source of references to those symbols in this compilation and since // we don't want to bloat the resulting code object. 
bool err = linker.linkInModule( std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, [](llvm::Module &m, const StringSet<> &gvs) { llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { return !gv.hasName() || (gvs.count(gv.getName()) == 0); }); }); // True is linker failure if (err) { getOperation().emitError( "Unrecoverable failure during device library linking."); // We have no guaranties about the state of `ret`, so bail return nullptr; } } return ret; } LogicalResult SerializeToHsacoPass::assembleIsa(const std::string &isa, SmallVectorImpl &result) { auto loc = getOperation().getLoc(); llvm::raw_svector_ostream os(result); llvm::Triple triple(llvm::Triple::normalize(this->triple)); std::string error; const llvm::Target *target = llvm::TargetRegistry::lookupTarget(triple.normalize(), error); if (!target) return emitError(loc, Twine("failed to lookup target: ") + error); llvm::SourceMgr srcMgr; srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc()); const llvm::MCTargetOptions mcOptions; std::unique_ptr mri( target->createMCRegInfo(this->triple)); std::unique_ptr mai( target->createMCAsmInfo(*mri, this->triple, mcOptions)); mai->setRelaxELFRelocations(true); std::unique_ptr sti( target->createMCSubtargetInfo(this->triple, this->chip, this->features)); llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr, &mcOptions); std::unique_ptr mofi(target->createMCObjectFileInfo( ctx, /*PIC=*/false, /*LargeCodeModel=*/false)); ctx.setObjectFileInfo(mofi.get()); SmallString<128> cwd; if (!llvm::sys::fs::current_path(cwd)) ctx.setCompilationDir(cwd); std::unique_ptr mcStreamer; std::unique_ptr mcii(target->createMCInstrInfo()); llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx); llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions); mcStreamer.reset(target->createMCObjectStreamer( triple, ctx, std::unique_ptr(mab), mab->createObjectWriter(os), std::unique_ptr(ce), *sti, 
mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false)); mcStreamer->setUseAssemblerInfoForParsing(true); std::unique_ptr parser( createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); std::unique_ptr tap( target->createMCAsmParser(*sti, *parser, *mcii, mcOptions)); if (!tap) return emitError(loc, "assembler initialization error"); parser->setTargetParser(*tap); parser->Run(false); return success(); } std::unique_ptr> SerializeToHsacoPass::createHsaco(ArrayRef isaBinary) { auto loc = getOperation().getLoc(); // Save the ISA binary to a temp file. int tempIsaBinaryFd = -1; SmallString<128> tempIsaBinaryFilename; if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename)) { emitError(loc, "temporary file for ISA binary creation error"); return {}; } llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename); llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true); tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size()); tempIsaBinaryOs.close(); // Create a temp file for HSA code object. SmallString<128> tempHsacoFilename; if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFilename)) { emitError(loc, "temporary file for HSA code object creation error"); return {}; } llvm::FileRemover cleanupHsaco(tempHsacoFilename); std::string theRocmPath = getRocmPath(); llvm::SmallString<32> lldPath(theRocmPath); llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld"); int lldResult = llvm::sys::ExecuteAndWait( lldPath, {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename}); if (lldResult != 0) { emitError(loc, "lld invocation error"); return {}; } // Load the HSA code object. 
auto hsacoFile = llvm::MemoryBuffer::getFile(tempHsacoFilename, /*IsText=*/false); if (!hsacoFile) { emitError(loc, "read HSA code object from temp file error"); return {}; } StringRef buffer = (*hsacoFile)->getBuffer(); return std::make_unique>(buffer.begin(), buffer.end()); } std::unique_ptr> SerializeToHsacoPass::serializeISA(const std::string &isa) { SmallVector isaBinary; if (failed(assembleIsa(isa, isaBinary))) return {}; return createHsaco(isaBinary); } // Register pass to serialize GPU kernel functions to a HSACO binary annotation. void mlir::registerGpuSerializeToHsacoPass() { PassRegistration registerSerializeToHSACO([] { return std::make_unique("amdgcn-amd-amdhsa", "", "", 2); }); } /// Create an instance of the GPU kernel function to HSAco binary serialization /// pass. std::unique_ptr mlir::createGpuSerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, int optLevel) { return std::make_unique(triple, arch, features, optLevel); } #else // MLIR_GPU_TO_HSACO_PASS_ENABLE void mlir::registerGpuSerializeToHsacoPass() {} #endif // MLIR_GPU_TO_HSACO_PASS_ENABLE