//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Transforms/DialectConversion.h"
|
||
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||
|
#include "llvm/Support/FormatVariadic.h"
|
||
|
|
||
|
#include "../GPUCommon/GPUOpsLowering.h"
|
||
|
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
|
||
|
#include "../GPUCommon/OpToFuncCallLowering.h"
|
||
|
|
||
|
namespace mlir {
|
||
|
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
|
||
|
#include "mlir/Conversion/Passes.h.inc"
|
||
|
} // namespace mlir
|
||
|
|
||
|
using namespace mlir;
|
||
|
|
||
|
/// Returns true if the given `gpu.func` can be safely called using the bare
|
||
|
/// pointer calling convention.
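/// This requires every memref argument to be convertible to a bare pointer,
/// i.e. to have a static shape and an identity layout; such arguments can be
/// passed as a single pointer instead of a full memref descriptor.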
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

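/// Returns the lane id of the current work item. A sketch of the intent
/// (assuming the usual semantics of the mbcnt intrinsics): mbcnt.lo(~0, 0)
/// counts the lanes below the current one among the lower 32 lanes of the
/// wavefront, and mbcnt.hi(~0, <lo>) adds the count for the upper 32 lanes,
/// so chaining the two yields the lane index within the wavefront.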
Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// The `width` argument is used to check whether the source lane
  /// participates; if it does not, the destination lane is the lane itself.
  ///
  /// Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 32 (usually the warp size),
  ///       step = [1, 2, 4, 8, 16, ..., width].
  ///   1. curLaneId = using mbcnt.lo + mbcnt.hi
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  ///   7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
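  /// For illustration (approximate, not verbatim lowered IR): a 32-bit
  /// `gpu.shuffle xor` becomes the mbcnt-based lane id computation above,
  /// LLVM arithmetic implementing steps 2-6, and a rocdl.ds_bpermute whose
  /// byte address operand is the selected lane id shifted left by 2.
  ///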
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // TODO: Add support for non 32-bit shuffle values.
    if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
      return failure();
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;
    // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN.
    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
    // perf.
    switch (op.getMode()) {
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    default:
      return failure();
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
    Value initShflValue = adaptor.getValue();
    if (adaptor.getValue().getType().isF32()) {
      initShflValue =
          rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
    }
    Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
        loc, int32Type, dwordAlignedDstLane, initShflValue);
    if (adaptor.getValue().getType().isF32()) {
      shflValue = rewriter.create<LLVM::BitcastOp>(
          loc, adaptor.getValue().getType(), shflValue);
    }
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
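    // Constructor arguments only provide defaults; an option that was set
    // explicitly (getNumOccurrences() != 0) takes precedence.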
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      arith::populateExpandBFloat16Patterns(patterns);
      (void)applyPatternsAndFoldGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
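    // Map the GPU memory space attributes to AMDGPU numeric address spaces:
    // 1 is global, 3 is workgroup (LDS), and 5 is private (scratch).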
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

    RewritePatternSet llvmPatterns(ctx);

    mlir::arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
    LLVMConversionTarget target(getContext());
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();

    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
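    // For example (illustrative): a kernel with known block sizes [128, 1, 1]
    // gets a required work-group size of [128, 1, 1] and a flat work-group
    // size string of "128,128".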
    m.walk([ctx](LLVM::LLVMFuncOp op) {
      if (auto blockSizes = dyn_cast_or_null<DenseI32ArrayAttr>(
              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName()))) {
        op->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
                    blockSizes);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        op->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
                    flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
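  // These LLVM math ops are marked illegal so that they are lowered to calls
  // into the ROCm device library (__ocml_*) by the patterns registered in
  // populateGpuToROCDLConversionPatterns.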
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

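/// Registers the lowering of `OpTy` to a device-library call: vector operands
/// are scalarized first, then the scalar op is rewritten into a call to
/// `f32Func` or `f64Func` depending on the element type (for example,
/// math.exp is mapped to __ocml_exp_f32 / __ocml_exp_f64 below).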
template <typename OpTy>
static void populateOpPatterns(LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}

void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime) {
  using mlir::gpu::amd::Runtime;

  populateWithGenerated(patterns);
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
  patterns.add<GPUIndexIntrinsicOpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                       ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
           GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      StringAttr::get(&converter.getContext(),
                      ROCDL::ROCDLDialect::getKernelFuncAttrName()));
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf()
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

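  // Lower math ops (and arith.remf) to calls into the ROCm OCML device
  // library.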
  populateOpPatterns<math::AbsFOp>(converter, patterns, "__ocml_fabs_f32",
                                   "__ocml_fabs_f64");
  populateOpPatterns<math::AtanOp>(converter, patterns, "__ocml_atan_f32",
                                   "__ocml_atan_f64");
  populateOpPatterns<math::Atan2Op>(converter, patterns, "__ocml_atan2_f32",
                                    "__ocml_atan2_f64");
  populateOpPatterns<math::CbrtOp>(converter, patterns, "__ocml_cbrt_f32",
                                   "__ocml_cbrt_f64");
  populateOpPatterns<math::CeilOp>(converter, patterns, "__ocml_ceil_f32",
                                   "__ocml_ceil_f64");
  populateOpPatterns<math::CosOp>(converter, patterns, "__ocml_cos_f32",
                                  "__ocml_cos_f64");
  populateOpPatterns<math::ExpOp>(converter, patterns, "__ocml_exp_f32",
                                  "__ocml_exp_f64");
  populateOpPatterns<math::Exp2Op>(converter, patterns, "__ocml_exp2_f32",
                                   "__ocml_exp2_f64");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, "__ocml_expm1_f32",
                                    "__ocml_expm1_f64");
  populateOpPatterns<math::FloorOp>(converter, patterns, "__ocml_floor_f32",
                                    "__ocml_floor_f64");
  populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32",
                                    "__ocml_fmod_f64");
  populateOpPatterns<math::LogOp>(converter, patterns, "__ocml_log_f32",
                                  "__ocml_log_f64");
  populateOpPatterns<math::Log10Op>(converter, patterns, "__ocml_log10_f32",
                                    "__ocml_log10_f64");
  populateOpPatterns<math::Log1pOp>(converter, patterns, "__ocml_log1p_f32",
                                    "__ocml_log1p_f64");
  populateOpPatterns<math::Log2Op>(converter, patterns, "__ocml_log2_f32",
                                   "__ocml_log2_f64");
  populateOpPatterns<math::PowFOp>(converter, patterns, "__ocml_pow_f32",
                                   "__ocml_pow_f64");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, "__ocml_rsqrt_f32",
                                    "__ocml_rsqrt_f64");
  populateOpPatterns<math::SinOp>(converter, patterns, "__ocml_sin_f32",
                                  "__ocml_sin_f64");
  populateOpPatterns<math::SqrtOp>(converter, patterns, "__ocml_sqrt_f32",
                                   "__ocml_sqrt_f64");
  populateOpPatterns<math::TanhOp>(converter, patterns, "__ocml_tanh_f32",
                                   "__ocml_tanh_f64");
  populateOpPatterns<math::TanOp>(converter, patterns, "__ocml_tan_f32",
                                  "__ocml_tan_f64");
  populateOpPatterns<math::ErfOp>(converter, patterns, "__ocml_erf_f32",
                                  "__ocml_erf_f64");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}