//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Transforms/DialectConversion.h"
|
||
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||
|
#include "llvm/Support/FormatVariadic.h"
|
||
|
|
||
|
#include "../GPUCommon/GPUOpsLowering.h"
|
||
|
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
|
||
|
#include "../GPUCommon/OpToFuncCallLowering.h"
|
||
|
|
||
|
namespace mlir {
|
||
|
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
|
||
|
#include "mlir/Conversion/Passes.h.inc"
|
||
|
} // namespace mlir
|
||
|
|
||
|
using namespace mlir;
|
||
|
|
||
|
/// Returns true if the given `gpu.func` can be safely called using the bare
|
||
|
/// pointer calling convention.
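/// This requires every memref argument to be convertible to a bare pointer,
/// i.e. to have a static shape and an identity layout; such arguments can be
/// passed as a single pointer instead of a full memref descriptor.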
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

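/// Returns the lane id of the current work item. A sketch of the intent
/// (assuming the usual semantics of the mbcnt intrinsics): mbcnt.lo(~0, 0)
/// counts the lanes below the current one among the lower 32 lanes of the
/// wavefront, and mbcnt.hi(~0, <lo>) adds the count for the upper 32 lanes,
/// so chaining the two yields the lane index within the wavefront.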
Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.createOrFold<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.createOrFold<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// The `width` argument is used to check whether the source lane
  /// participates; if it does not, the destination lane is the lane itself.
  ///
  /// Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 32 (usually the warp size),
  ///       step = [1, 2, 4, 8, 16, ..., width].
  ///   1. curLaneId = using mbcnt.lo + mbcnt.hi
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  ///   7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
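  /// For illustration (approximate, not verbatim lowered IR): a 32-bit
  /// `gpu.shuffle xor` becomes the mbcnt-based lane id computation above,
  /// LLVM arithmetic implementing steps 2-6, and a rocdl.ds_bpermute whose
  /// byte address operand is the selected lane id shifted left by 2.
  ///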
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // TODO: Add support for non 32-bit shuffle values.
    if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
      return failure();
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;
    // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN.
    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
    // perf.
    switch (op.getMode()) {
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    default:
      return failure();
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
    Value initShflValue = adaptor.getValue();
    if (adaptor.getValue().getType().isF32()) {
      initShflValue =
          rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
    }
    Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
        loc, int32Type, dwordAlignedDstLane, initShflValue);
    if (adaptor.getValue().getType().isF32()) {
      shflValue = rewriter.create<LLVM::BitcastOp>(
          loc, adaptor.getValue().getType(), shflValue);
    }
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
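    // Constructor arguments only provide defaults; an option that was set
    // explicitly (getNumOccurrences() != 0) takes precedence.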
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      arith::populateExpandBFloat16Patterns(patterns);
      (void)applyPatternsAndFoldGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
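    // Map the GPU memory space attributes to AMDGPU numeric address spaces:
    // 1 is global, 3 is workgroup (LDS), and 5 is private (scratch).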
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

    RewritePatternSet llvmPatterns(ctx);

    mlir::arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
    LLVMConversionTarget target(getContext());
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();

    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
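    // For example (illustrative): a kernel with known block sizes [128, 1, 1]
    // gets a required work-group size of [128, 1, 1] and a flat work-group
    // size string of "128,128".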
    m.walk([ctx](LLVM::LLVMFuncOp op) {
      if (auto blockSizes = dyn_cast_or_null<DenseI32ArrayAttr>(
              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName()))) {
        op->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
                    blockSizes);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        op->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
                    flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
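  // These LLVM math ops are marked illegal so that they are lowered to calls
  // into the ROCm device library (__ocml_*) by the patterns registered in
  // populateGpuToROCDLConversionPatterns.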
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

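/// Registers the lowering of `OpTy` to a device-library call: vector operands
/// are scalarized first, then the scalar op is rewritten into a call to
/// `f32Func` or `f64Func` depending on the element type (for example,
/// math.exp is mapped to __ocml_exp_f32 / __ocml_exp_f64 below).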
template <typename OpTy>
static void populateOpPatterns(LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}

void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime) {
  using mlir::gpu::amd::Runtime;

  populateWithGenerated(patterns);
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
  patterns.add<GPUIndexIntrinsicOpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                       ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
           GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      StringAttr::get(&converter.getContext(),
                      ROCDL::ROCDLDialect::getKernelFuncAttrName()));
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf()
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

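  // Lower math ops (and arith.remf) to calls into the ROCm OCML device
  // library.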
  populateOpPatterns<math::AbsFOp>(converter, patterns, "__ocml_fabs_f32",
                                   "__ocml_fabs_f64");
  populateOpPatterns<math::AtanOp>(converter, patterns, "__ocml_atan_f32",
                                   "__ocml_atan_f64");
  populateOpPatterns<math::Atan2Op>(converter, patterns, "__ocml_atan2_f32",
                                    "__ocml_atan2_f64");
  populateOpPatterns<math::CbrtOp>(converter, patterns, "__ocml_cbrt_f32",
                                   "__ocml_cbrt_f64");
  populateOpPatterns<math::CeilOp>(converter, patterns, "__ocml_ceil_f32",
                                   "__ocml_ceil_f64");
  populateOpPatterns<math::CosOp>(converter, patterns, "__ocml_cos_f32",
                                  "__ocml_cos_f64");
  populateOpPatterns<math::ExpOp>(converter, patterns, "__ocml_exp_f32",
                                  "__ocml_exp_f64");
  populateOpPatterns<math::Exp2Op>(converter, patterns, "__ocml_exp2_f32",
                                   "__ocml_exp2_f64");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, "__ocml_expm1_f32",
                                    "__ocml_expm1_f64");
  populateOpPatterns<math::FloorOp>(converter, patterns, "__ocml_floor_f32",
                                    "__ocml_floor_f64");
  populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32",
                                    "__ocml_fmod_f64");
  populateOpPatterns<math::LogOp>(converter, patterns, "__ocml_log_f32",
                                  "__ocml_log_f64");
  populateOpPatterns<math::Log10Op>(converter, patterns, "__ocml_log10_f32",
                                    "__ocml_log10_f64");
  populateOpPatterns<math::Log1pOp>(converter, patterns, "__ocml_log1p_f32",
                                    "__ocml_log1p_f64");
  populateOpPatterns<math::Log2Op>(converter, patterns, "__ocml_log2_f32",
                                   "__ocml_log2_f64");
  populateOpPatterns<math::PowFOp>(converter, patterns, "__ocml_pow_f32",
                                   "__ocml_pow_f64");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, "__ocml_rsqrt_f32",
                                    "__ocml_rsqrt_f64");
  populateOpPatterns<math::SinOp>(converter, patterns, "__ocml_sin_f32",
                                  "__ocml_sin_f64");
  populateOpPatterns<math::SqrtOp>(converter, patterns, "__ocml_sqrt_f32",
                                   "__ocml_sqrt_f64");
  populateOpPatterns<math::TanhOp>(converter, patterns, "__ocml_tanh_f32",
                                   "__ocml_tanh_f64");
  populateOpPatterns<math::TanOp>(converter, patterns, "__ocml_tan_f32",
                                  "__ocml_tan_f64");
  populateOpPatterns<math::ErfOp>(converter, patterns, "__ocml_erf_f32",
                                  "__ocml_erf_f64");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}