//===- LoopVersioning.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
/// \file
/// This pass looks for loops iterating over assumed-shape arrays, that can
/// be optimized by "guessing" that the stride is element-sized.
///
/// This is done by creating two versions of the same loop: one which assumes
/// that the elements are contiguous (stride == size of element), and one that
/// is the original generic loop.
///
/// As a side-effect of the assumed element size stride, the array is also
/// flattened to make it a 1D array - this is because the internal array
/// structure must be either 1D or have known sizes in all dimensions - and at
/// least one of the dimensions here is already unknown.
///
/// There are two distinct benefits here:
/// 1. The loop that iterates over the elements is somewhat simplified by the
///    constant stride calculation.
/// 2. Since the compiler can understand the size of the stride, it can use
///    vector instructions, where an unknown (at compile time) stride does often
///    prevent vector operations from being used.
///
/// A known drawback is that the code-size is increased, in some cases that can
/// be quite substantial - 3-4x is quite plausible (this includes that the loop
/// gets vectorized, which in itself often more than doubles the size of the
/// code, because unless the loop size is known, there will be a modulo
/// vector-size remainder to deal with).
///
/// TODO: Do we need some size limit where loops no longer get duplicated?
///       Maybe some sort of cost analysis.
/// TODO: Should some loop content - for example calls to functions and /// subroutines inhibit the versioning of the loops. Plausibly, this /// could be part of the cost analysis above. //===----------------------------------------------------------------------===// #include "flang/ISO_Fortran_binding_wrapper.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/Inquiry.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include namespace fir { #define GEN_PASS_DEF_LOOPVERSIONING #include "flang/Optimizer/Transforms/Passes.h.inc" } // namespace fir #define DEBUG_TYPE "flang-loop-versioning" namespace { class LoopVersioningPass : public fir::impl::LoopVersioningBase { public: void runOnOperation() override; }; /// @struct ArgInfo /// A structure to hold an argument, the size of the argument and dimension /// information. struct ArgInfo { mlir::Value arg; size_t size; unsigned rank; fir::BoxDimsOp dims[CFI_MAX_RANK]; }; /// @struct ArgsUsageInLoop /// A structure providing information about the function arguments /// usage by the instructions immediately nested in a loop. struct ArgsUsageInLoop { /// Mapping between the memref operand of an array indexing /// operation (e.g. fir.coordinate_of) and the argument information. 
llvm::DenseMap usageInfo; /// Some array indexing operations inside a loop cannot be transformed. /// This vector holds the memref operands of such operations. /// The vector is used to make sure that we do not try to transform /// any outer loop, since this will imply the operation rewrite /// in this loop. llvm::SetVector cannotTransform; // Debug dump of the structure members assuming that // the information has been collected for the given loop. void dump(fir::DoLoopOp loop) const { LLVM_DEBUG({ mlir::OpPrintingFlags printFlags; printFlags.skipRegions(); llvm::dbgs() << "Arguments usage info for loop:\n"; loop.print(llvm::dbgs(), printFlags); llvm::dbgs() << "\nUsed args:\n"; for (auto &use : usageInfo) { mlir::Value v = use.first; v.print(llvm::dbgs(), printFlags); llvm::dbgs() << "\n"; } llvm::dbgs() << "\nCannot transform args:\n"; for (mlir::Value arg : cannotTransform) { arg.print(llvm::dbgs(), printFlags); llvm::dbgs() << "\n"; } llvm::dbgs() << "====\n"; }); } // Erase usageInfo and cannotTransform entries for a set // of given arguments. void eraseUsage(const llvm::SetVector &args) { for (auto &arg : args) usageInfo.erase(arg); cannotTransform.set_subtract(args); } // Erase usageInfo and cannotTransform entries for a set // of given arguments provided in the form of usageInfo map. void eraseUsage(const llvm::DenseMap &args) { for (auto &arg : args) { usageInfo.erase(arg.first); cannotTransform.remove(arg.first); } } }; } // namespace static fir::SequenceType getAsSequenceType(mlir::Value *v) { mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType())); return argTy.dyn_cast(); } /// if a value comes from a fir.declare, follow it to the original source, /// otherwise return the value static mlir::Value unwrapFirDeclare(mlir::Value val) { // fir.declare is for source code variables. 
We don't have declares of // declares if (fir::DeclareOp declare = val.getDefiningOp()) return declare.getMemref(); return val; } /// if a value comes from a fir.rebox, follow the rebox to the original source, /// of the value, otherwise return the value static mlir::Value unwrapReboxOp(mlir::Value val) { // don't support reboxes of reboxes if (fir::ReboxOp rebox = val.getDefiningOp()) val = rebox.getBox(); return val; } /// normalize a value (removing fir.declare and fir.rebox) so that we can /// more conveniently spot values which came from function arguments static mlir::Value normaliseVal(mlir::Value val) { return unwrapFirDeclare(unwrapReboxOp(val)); } /// some FIR operations accept a fir.shape, a fir.shift or a fir.shapeshift. /// fir.shift and fir.shapeshift allow us to extract lower bounds /// if lowerbounds cannot be found, return nullptr static mlir::Value tryGetLowerBoundsFromShapeLike(mlir::Value shapeLike, unsigned dim) { mlir::Value lowerBound{nullptr}; if (auto shift = shapeLike.getDefiningOp()) lowerBound = shift.getOrigins()[dim]; if (auto shapeShift = shapeLike.getDefiningOp()) lowerBound = shapeShift.getOrigins()[dim]; return lowerBound; } /// attempt to get the array lower bounds of dimension dim of the memref /// argument to a fir.array_coor op /// 0 <= dim < rank /// May return nullptr if no lower bounds can be determined static mlir::Value getLowerBound(fir::ArrayCoorOp coop, unsigned dim) { // 1) try to get from the shape argument to fir.array_coor if (mlir::Value shapeLike = coop.getShape()) if (mlir::Value lb = tryGetLowerBoundsFromShapeLike(shapeLike, dim)) return lb; // It is important not to try to read the lower bound from the box, because // in the FIR lowering, boxes will sometimes contain incorrect lower bound // information // out of ideas return {}; } /// gets the i'th index from array coordinate operation op /// dim should range between 0 and rank - 1 static mlir::Value getIndex(fir::FirOpBuilder &builder, mlir::Operation *op, 
unsigned dim) { if (fir::CoordinateOp coop = mlir::dyn_cast(op)) return coop.getCoor()[dim]; fir::ArrayCoorOp coop = mlir::dyn_cast(op); assert(coop && "operation must be either fir.coordiante_of or fir.array_coor"); // fir.coordinate_of indices start at 0: adjust these indices to match by // subtracting the lower bound mlir::Value index = coop.getIndices()[dim]; mlir::Value lb = getLowerBound(coop, dim); if (!lb) // assume a default lower bound of one lb = builder.createIntegerConstant(coop.getLoc(), index.getType(), 1); // index_0 = index - lb; if (lb.getType() != index.getType()) lb = builder.createConvert(coop.getLoc(), index.getType(), lb); return builder.create(coop.getLoc(), index, lb); } void LoopVersioningPass::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n"); mlir::func::FuncOp func = getOperation(); // First look for arguments with assumed shape = unknown extent in the lowest // dimension. LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n"); mlir::Block::BlockArgListType args = func.getArguments(); mlir::ModuleOp module = func->getParentOfType(); fir::KindMapping kindMap = fir::getKindMapping(module); mlir::SmallVector argsOfInterest; for (auto &arg : args) { // Optional arguments must be checked for IsPresent before // looking for the bounds. They are unsupported for the time being. 
if (func.getArgAttrOfType(arg.getArgNumber(), fir::getOptionalAttrName())) { LLVM_DEBUG(llvm::dbgs() << "OPTIONAL is not supported\n"); continue; } if (auto seqTy = getAsSequenceType(&arg)) { unsigned rank = seqTy.getDimension(); if (rank > 0 && seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) { size_t typeSize = 0; mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType()); if (elementType.isa() || elementType.isa()) typeSize = elementType.getIntOrFloatBitWidth() / 8; else if (auto cty = elementType.dyn_cast()) typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8; if (typeSize) argsOfInterest.push_back({arg, typeSize, rank, {}}); else LLVM_DEBUG(llvm::dbgs() << "Type not supported\n"); } } } if (argsOfInterest.empty()) { LLVM_DEBUG(llvm::dbgs() << "No suitable arguments.\n=== End " DEBUG_TYPE " ===\n"); return; } // A list of all loops in the function in post-order. mlir::SmallVector originalLoops; // Information about the arguments usage by the instructions // immediately nested in a loop. llvm::DenseMap argsInLoops; auto &domInfo = getAnalysis(); // Traverse the loops in post-order and see // if those arguments are used inside any loop. func.walk([&](fir::DoLoopOp loop) { mlir::Block &body = *loop.getBody(); auto &argsInLoop = argsInLoops[loop]; originalLoops.push_back(loop); body.walk([&](mlir::Operation *op) { // Support either fir.array_coor or fir.coordinate_of. if (!mlir::isa(op)) return; // Process only operations immediately nested in the current loop. if (op->getParentOfType() != loop) return; mlir::Value operand = op->getOperand(0); for (auto a : argsOfInterest) { if (a.arg == normaliseVal(operand)) { // Use the reboxed value, not the block arg when re-creating the loop. a.arg = operand; // Check that the operand dominates the loop? // If this is the case, record such operands in argsInLoop.cannot- // Transform, so that they disable the transformation for the parent /// loops as well. 
if (!domInfo.dominates(a.arg, loop)) argsInLoop.cannotTransform.insert(a.arg); // No support currently for sliced arrays. // This means that we cannot transform properly // instructions referencing a.arg in the whole loop // nest this loop is located in. if (auto arrayCoor = mlir::dyn_cast(op)) if (arrayCoor.getSlice()) argsInLoop.cannotTransform.insert(a.arg); if (argsInLoop.cannotTransform.contains(a.arg)) { // Remove any previously recorded usage, if any. argsInLoop.usageInfo.erase(a.arg); break; } // Record the a.arg usage, if not recorded yet. argsInLoop.usageInfo.try_emplace(a.arg, a); break; } } }); }); // Dump loops info after initial collection. LLVM_DEBUG({ llvm::dbgs() << "Initial usage info:\n"; for (fir::DoLoopOp loop : originalLoops) { auto &argsInLoop = argsInLoops[loop]; argsInLoop.dump(loop); } }); // Clear argument usage for parent loops if an inner loop // contains a non-transformable usage. for (fir::DoLoopOp loop : originalLoops) { auto &argsInLoop = argsInLoops[loop]; if (argsInLoop.cannotTransform.empty()) continue; fir::DoLoopOp parent = loop; while ((parent = parent->getParentOfType())) argsInLoops[parent].eraseUsage(argsInLoop.cannotTransform); } // If an argument access can be optimized in a loop and // its descendant loop, then it does not make sense to // generate the contiguity check for the descendant loop. // The check will be produced as part of the ancestor // loop's transformation. So we can clear the argument // usage for all descendant loops. for (fir::DoLoopOp loop : originalLoops) { auto &argsInLoop = argsInLoops[loop]; if (argsInLoop.usageInfo.empty()) continue; loop.getBody()->walk([&](fir::DoLoopOp dloop) { argsInLoops[dloop].eraseUsage(argsInLoop.usageInfo); }); } LLVM_DEBUG({ llvm::dbgs() << "Final usage info:\n"; for (fir::DoLoopOp loop : originalLoops) { auto &argsInLoop = argsInLoops[loop]; argsInLoop.dump(loop); } }); // Reduce the collected information to a list of loops // with attached arguments usage information. 
// The list must hold the loops in post order, so that // the inner loops are transformed before the outer loops. struct OpsWithArgs { mlir::Operation *op; mlir::SmallVector argsAndDims; }; mlir::SmallVector loopsOfInterest; for (fir::DoLoopOp loop : originalLoops) { auto &argsInLoop = argsInLoops[loop]; if (argsInLoop.usageInfo.empty()) continue; OpsWithArgs info; info.op = loop; for (auto &arg : argsInLoop.usageInfo) info.argsAndDims.push_back(arg.second); loopsOfInterest.emplace_back(std::move(info)); } if (loopsOfInterest.empty()) { LLVM_DEBUG(llvm::dbgs() << "No loops to transform.\n=== End " DEBUG_TYPE " ===\n"); return; } // If we get here, there are loops to process. fir::FirOpBuilder builder{module, std::move(kindMap)}; mlir::Location loc = builder.getUnknownLoc(); mlir::IndexType idxTy = builder.getIndexType(); LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:"); LLVM_DEBUG(module->dump()); LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size() << "\n"); for (auto op : loopsOfInterest) { LLVM_DEBUG(op.op->dump()); builder.setInsertionPoint(op.op); mlir::Value allCompares = nullptr; // Ensure all of the arrays are unit-stride. for (auto &arg : op.argsAndDims) { // Fetch all the dimensions of the array, except the last dimension. // Always fetch the first dimension, however, so set ndims = 1 if // we have one dim unsigned ndims = arg.rank; for (unsigned i = 0; i < ndims; i++) { mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); arg.dims[i] = builder.create(loc, idxTy, idxTy, idxTy, arg.arg, dimIdx); } // We only care about lowest order dimension, here. 
mlir::Value elemSize = builder.createIntegerConstant(loc, idxTy, arg.size); mlir::Value cmp = builder.create( loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2), elemSize); if (!allCompares) { allCompares = cmp; } else { allCompares = builder.create(loc, cmp, allCompares); } } auto ifOp = builder.create(loc, op.op->getResultTypes(), allCompares, /*withElse=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n"); mlir::Operation *clonedLoop = op.op->clone(); bool changed = false; for (auto &arg : op.argsAndDims) { fir::SequenceType::Shape newShape; newShape.push_back(fir::SequenceType::getUnknownExtent()); auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg.getType()); mlir::Type arrTy = fir::SequenceType::get(newShape, elementType); mlir::Type boxArrTy = fir::BoxType::get(arrTy); mlir::Type refArrTy = builder.getRefType(arrTy); auto carg = builder.create(loc, boxArrTy, arg.arg); auto caddr = builder.create(loc, refArrTy, carg); auto insPt = builder.saveInsertionPoint(); // Use caddr instead of arg. clonedLoop->walk([&](mlir::Operation *coop) { if (!mlir::isa(coop)) return; // Reduce the multi-dimensioned index to a single index. // This is required becase fir arrays do not support multiple dimensions // with unknown dimensions at compile time. // We then calculate the multidimensional array like this: // arr(x, y, z) bedcomes arr(z * stride(2) + y * stride(1) + x) // where stride is the distance between elements in the dimensions // 0, 1 and 2 or x, y and z. if (coop->getOperand(0) == arg.arg && coop->getOperands().size() >= 2) { builder.setInsertionPoint(coop); mlir::Value totalIndex; for (unsigned i = arg.rank - 1; i > 0; i--) { mlir::Value curIndex = builder.createConvert(loc, idxTy, getIndex(builder, coop, i)); // Multiply by the stride of this array. Later we'll divide by the // element size. 
mlir::Value scale = builder.createConvert(loc, idxTy, arg.dims[i].getResult(2)); curIndex = builder.create(loc, scale, curIndex); totalIndex = (totalIndex) ? builder.create( loc, curIndex, totalIndex) : curIndex; } // This is the lowest dimension - which doesn't need scaling mlir::Value finalIndex = builder.createConvert(loc, idxTy, getIndex(builder, coop, 0)); if (totalIndex) { assert(llvm::isPowerOf2_32(arg.size) && "Expected power of two here"); unsigned bits = llvm::Log2_32(arg.size); mlir::Value elemShift = builder.createIntegerConstant(loc, idxTy, bits); totalIndex = builder.create( loc, builder.create(loc, totalIndex, elemShift), finalIndex); } else { totalIndex = finalIndex; } auto newOp = builder.create( loc, builder.getRefType(elementType), caddr, mlir::ValueRange{totalIndex}); LLVM_DEBUG(newOp->dump()); coop->getResult(0).replaceAllUsesWith(newOp->getResult(0)); coop->erase(); changed = true; } }); builder.restoreInsertionPoint(insPt); } assert(changed && "Expected operations to have changed"); builder.insert(clonedLoop); // Forward the result(s), if any, from the loop operation to the // mlir::ResultRange results = clonedLoop->getResults(); bool hasResults = (results.size() > 0); if (hasResults) builder.create(loc, results); // Add the original loop in the else-side of the if operation. builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); op.op->replaceAllUsesWith(ifOp); op.op->remove(); builder.insert(op.op); // Rely on "cloned loop has results, so original loop also has results". if (hasResults) { builder.create(loc, op.op->getResults()); } else { // Use an assert to check this. assert(op.op->getResults().size() == 0 && "Weird, the cloned loop doesn't have results, but the original " "does?"); } } LLVM_DEBUG(llvm::dbgs() << "After transform:\n"); LLVM_DEBUG(module->dump()); LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n"); } std::unique_ptr fir::createLoopVersioningPass() { return std::make_unique(); }