//===- TilingInterfaceImpl.cpp - Implementation of TilingInterface -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Interfaces/TilingInterface.h" #include using namespace mlir; using namespace mlir::linalg; //===----------------------------------------------------------------------===// // Utility methods for implementation of Tiling Interface for Linalg ops //===----------------------------------------------------------------------===// /// Return the SSA values that represent the data point accessed using a given /// `indexingMap` for a given point in the iteration space represented by `ivs`. static SmallVector getIndicesForAccess(OpBuilder &b, Location loc, AffineMap indexingMap, ValueRange ivs) { SmallVector indices; indices.reserve(indexingMap.getNumResults()); for (auto result : indexingMap.getResults()) { AffineMap m = AffineMap::get(indexingMap.getNumDims(), indexingMap.getNumSymbols(), result); Value v = b.create(loc, m, ivs); indices.push_back(v); } return indices; } /// Method to inline the payload of a `linalgOp` given the iteration space /// point and values for the arguments of the payload. static LogicalResult inlinePayload(OpBuilder &b, LinalgOp linalgOp, ValueRange ivs, ValueRange argValues) { Block *body = linalgOp.getBlock(); IRMapping map; map.map(body->getArguments(), argValues); for (auto &op : body->without_terminator()) { if (auto indexOp = dyn_cast(&op)) { map.map(indexOp.getResult(), ivs[indexOp.getDim()]); continue; } b.clone(op, map); } Operation *terminator = body->getTerminator(); Location loc = terminator->getLoc(); for (const auto &operand : llvm::enumerate(terminator->getOperands())) { Value toStore = map.lookupOrDefault(operand.value()); OpOperand *storeInto = linalgOp.getDpsInitOperand(operand.index()); auto indices = getIndicesForAccess( b, loc, linalgOp.getMatchingIndexingMap(storeInto), ivs); b.create( loc, toStore, linalgOp.getDpsInitOperand(operand.index())->get(), indices); } return success(); } //===----------------------------------------------------------------------===// // External Model for implementing `TilingInterface` for `LinalgOp`s. //===----------------------------------------------------------------------===// namespace { /// External model implementation of TilingInterface for LinalgOps. An external /// model implementation is used for now till the use of `TilingInterface` is /// on-par with the current Linalg tiling + fusion patterns. Once it is /// maybe possible to move this into the op-definition (though there are /// advantages to leaving it as an external model) template struct LinalgOpTilingInterface : public TilingInterface::ExternalModel, LinalgOpTy> { /// Return the loop iterator type. SmallVector getLoopIteratorTypes(Operation *op) const { LinalgOpTy concreteOp = cast(op); return concreteOp.getIteratorTypesArray(); } /// Return the iteration domain range. SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { OpBuilder::InsertionGuard g(b); b.setInsertionPoint(op); Location loc = op->getLoc(); LinalgOp linalgOp = cast(op); SmallVector allShapesSizes = linalgOp.createFlatListOfOperandDims(b, loc); AffineMap map = linalgOp.getShapesToLoopsMap(); return llvm::to_vector( llvm::map_range(map.getResults(), [&](AffineExpr loopExpr) { OpFoldResult ofr = affine::makeComposedFoldedAffineApply( b, loc, loopExpr, allShapesSizes); return Range{b.getIndexAttr(0), ofr, b.getIndexAttr(1)}; })); } // Instantiate the tiled implementation of the operation. FailureOr getTiledImplementation(Operation *op, OpBuilder &b, ArrayRef offsets, ArrayRef sizes) const { // Leave the `sizeBounds` value empty. That is only needed when the `sizes` // specified could lead to out of bounds accesses. Location loc = op->getLoc(); LinalgOp linalgOp = cast(op); SmallVector valuesToTile = linalgOp->getOperands(); SmallVector tiledOperands = makeTiledShapes( b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true); SmallVector resultTensorTypes = getTensorOutputTypes(linalgOp, tiledOperands); Operation *tiledOp = clone(b, linalgOp, resultTensorTypes, tiledOperands); offsetIndices(b, cast(tiledOp), offsets); return TilingResult{{tiledOp}, SmallVector(tiledOp->getResults())}; } // Return the details of the output tile generated by the tiled // implementation. LogicalResult getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, SmallVector &resultOffsets, SmallVector &resultSizes) const { Location loc = op->getLoc(); LinalgOp linalgOp = cast(op); AffineExpr d0; bindDims(b.getContext(), d0); SmallVector subShapeSizes = llvm::to_vector(llvm::map_range(sizes, [&](OpFoldResult ofr) { return affine::makeComposedFoldedAffineApply(b, loc, d0 - 1, ofr); })); OpOperand *outOperand = linalgOp.getDpsInitOperand(resultNumber); SliceParameters sliceParams = computeSliceParameters( b, loc, outOperand->get(), sizes, linalgOp.getMatchingIndexingMap(outOperand), offsets, /*ubs*/ {}, subShapeSizes, true); resultOffsets = sliceParams.offsets; resultSizes = sliceParams.sizes; return success(); } FailureOr generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes) const { auto linalgOp = cast(op); // Check that the indexing map used for the output is a projected // permutation. This could be relaxed with a more general approach that can // map the offsets and sizes from the result to iteration space tiles // (filling in full extent for dimensions not used to access the result). AffineMap indexingMap = linalgOp.getIndexingMapMatchingResult(op->getResult(resultNumber)); if (!indexingMap.isProjectedPermutation()) { return op->emitOpError( "unhandled tiled implementation generation when result is not " "accessed using a permuted projection"); } auto numLoops = linalgOp.getNumLoops(); auto tilingInterfaceOp = cast(op); SmallVector iterationTileOffsets(numLoops), iterationTileSizes(numLoops); if (!indexingMap.isPermutation()) { SmallVector iterationDomain = tilingInterfaceOp.getIterationDomain(b); for (const auto &range : llvm::enumerate(iterationDomain)) { iterationTileOffsets[range.index()] = range.value().offset; iterationTileSizes[range.index()] = range.value().size; } } for (const auto &resultExpr : llvm::enumerate(indexingMap.getResults())) { unsigned dimPosition = cast(resultExpr.value()).getPosition(); iterationTileOffsets[dimPosition] = offsets[resultExpr.index()]; iterationTileSizes[dimPosition] = sizes[resultExpr.index()]; } FailureOr tilingResult = tilingInterfaceOp.getTiledImplementation(b, iterationTileOffsets, iterationTileSizes); if (tilingResult->tiledOps.size() != 1) return op->emitOpError("failed to generate tiled implementation"); return TilingResult{ tilingResult->tiledOps, SmallVector{tilingResult->tiledValues[resultNumber]}}; } LogicalResult generateScalarImplementation(Operation *op, OpBuilder &builder, Location loc, ValueRange ivs) const { auto linalgOp = cast(op); if (!linalgOp.hasPureBufferSemantics()) return op->emitOpError("expected operation to have buffer semantics"); SmallVector indexedValues; indexedValues.reserve(linalgOp->getNumOperands()); Location linalgOpLoc = op->getLoc(); /// Load the data corresponding to the block arguments that /// represent input operands. for (OpOperand &operand : linalgOp->getOpOperands()) { if (!linalgOp.payloadUsesValueFromOperand(&operand)) { indexedValues.push_back(nullptr); continue; } if (linalgOp.isScalar(&operand)) { indexedValues.push_back(operand.get()); continue; } SmallVector indices = getIndicesForAccess( builder, linalgOpLoc, linalgOp.getMatchingIndexingMap(&operand), ivs); Value load = builder.create(linalgOpLoc, operand.get(), indices); indexedValues.push_back(load); } /// Inline the op payload and store the result. return inlinePayload(builder, linalgOp, ivs, indexedValues); } }; //===----------------------------------------------------------------------===// // External Model for implementing `PartialReductionInterface` for `LinalgOp`s. //===----------------------------------------------------------------------===// /// External model implementation of PartialReductionInterface for LinalgOps. template struct LinalgOpPartialReductionInterface : public PartialReductionOpInterface::ExternalModel< LinalgOpPartialReductionInterface, LinalgOpTy> { FailureOr generateInitialTensorForPartialReduction( Operation *op, OpBuilder &b, Location loc, ArrayRef sizes, ArrayRef reductionDims) const { auto linalgOp = cast(op); OpBuilder::InsertionGuard guard(b); if (linalgOp.hasPureBufferSemantics()) return op->emitOpError("expected operation to have tensor semantics"); // Insert the new parallel dimension based on the index of the reduction // loops. This could be controlled by user for more flexibility. SmallVector combinerOps; if (!matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps) || combinerOps.size() != 1) return op->emitOpError("Failed to anaysis the reduction operation."); Operation *reductionOp = combinerOps[0]; std::optional identity = arith::getNeutralElement(reductionOp); if (!identity.has_value()) return op->emitOpError( "Failed to get an identity value for the reduction operation."); ArrayRef oldShape = linalgOp.getShape(linalgOp.getDpsInitOperand(0)); // Extend tile size vector to the rank of the output tensor. SmallVector tileSizeVector = getValueOrCreateConstantIndexOp(b, loc, sizes); if (tileSizeVector.size() < oldShape.size()) { auto zero = b.create(loc, 0); tileSizeVector.append(oldShape.size() - tileSizeVector.size(), zero); } // Calculate the new shape, we insert the new dimensions based on the index // of the reduction dimensions. SmallVector newOutputShape; SmallVector dynamicDims; int64_t currReductionDims = 0; DenseSet reductionDimsSet(reductionDims.begin(), reductionDims.end()); for (int64_t idx : llvm::seq(0, oldShape.size() + reductionDims.size())) { if (reductionDimsSet.contains(idx)) { dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape); currReductionDims++; continue; } int64_t oldIdx = idx - currReductionDims; int64_t dim = oldShape[oldIdx]; newOutputShape.push_back(dim); if (ShapedType::isDynamic(dim)) dynamicDims.push_back(b.create( loc, linalgOp.getDpsInitOperand(0)->get(), oldIdx)); } Value emptyTensor = b.create( loc, newOutputShape, linalgOp.getRegionOutputArgs()[0].getType(), dynamicDims); Value constantOp = b.create(loc, *identity); auto identityTensor = b.create(loc, constantOp, emptyTensor); return identityTensor.getOperation(); } Operation *tileToPartialReduction(Operation *op, OpBuilder &b, Location loc, ValueRange init, ArrayRef offsets, ArrayRef sizes, ArrayRef reductionDims) const { OpBuilder::InsertionGuard guard(b); auto linalgOp = cast(op); AffineMap oldOutputMap = linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0)); SmallVector outputExpr(oldOutputMap.getNumResults() + reductionDims.size()); for (int idx : reductionDims) outputExpr[idx] = b.getAffineDimExpr(idx); int currExpr = 0; for (int idx : llvm::seq(0, outputExpr.size())) { if (outputExpr[idx]) continue; outputExpr[idx] = oldOutputMap.getResult(currExpr++); } // Step 1: Extract a slice of the input operands. SmallVector valuesToTile = linalgOp.getDpsInputs(); SmallVector tiledOperands = makeTiledShapes( b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true); // Step 2: Extract the accumulator operands SmallVector strides(offsets.size(), b.getIndexAttr(1)); SmallVector outOffsets(offsets.size(), b.getIndexAttr(0)); // TODO: use SubsetExtractOpInterface once it is available. Value out = b.create(loc, init[0], outOffsets, sizes, strides); // Step3. Create a generic op where the reduction dimensions are replaced // by a parallel dimension of the size of reduction. SmallVector newIteratorTypes = linalgOp.getIteratorTypesArray(); for (int dim : reductionDims) newIteratorTypes[dim] = utils::IteratorType::parallel; SmallVector newMaps = linalgOp.getIndexingMapsArray(); newMaps.back() = AffineMap::get(newMaps.back().getNumDims(), 0, outputExpr, linalgOp.getContext()); auto genericOp = b.create(loc, TypeRange({out.getType()}), tiledOperands, ValueRange({out}), newMaps, newIteratorTypes); IRMapping mapping; op->getRegion(0).cloneInto(&genericOp.getRegion(), genericOp.getRegion().begin(), mapping); return genericOp.getOperation(); } Operation *mergeReductions(Operation *op, OpBuilder &b, Location loc, ValueRange partialReduce, ArrayRef reductionDims) const { auto linalgOp = cast(op); DenseSet reductionDimsSet(reductionDims.begin(), reductionDims.end()); // Then create a new reduction that only reduce the newly added dimensions // from the previous op. int64_t intermRank = cast(partialReduce[0].getType()).getRank(); AffineMap inputMap = b.getMultiDimIdentityMap(intermRank); SmallVector reductionIteratorTypes; SmallVector exprs; for (int64_t i : llvm::seq(0, intermRank)) { if (reductionDimsSet.contains(i)) { reductionIteratorTypes.push_back(utils::IteratorType::reduction); } else { exprs.push_back(b.getAffineDimExpr(i)); reductionIteratorTypes.push_back(utils::IteratorType::parallel); } } AffineMap outputMap = AffineMap::get(intermRank, 0, exprs, op->getContext()); SmallVector reductionMaps = {inputMap, outputMap}; SmallVector combinerOps; matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps); Operation *reductionOp = combinerOps[0]; auto reduction = b.create( loc, op->getResultTypes(), ValueRange({partialReduce[0]}), linalgOp.getDpsInits(), reductionMaps, reductionIteratorTypes, [reductionOp](OpBuilder &b, Location loc, ValueRange inputs) { Operation *clonedReductionOp = b.clone(*reductionOp); clonedReductionOp->setOperand(0, inputs[0]); clonedReductionOp->setOperand(1, inputs[1]); b.create(loc, clonedReductionOp->getResult(0)); }); return reduction.getOperation(); } }; } // namespace template static void registerOne(MLIRContext *ctx) { OpType::template attachInterface>(*ctx); OpType::template attachInterface>( *ctx); } /// Variadic helper function. template static void registerAll(MLIRContext *ctx) { (registerOne(ctx), ...); } #define GET_OP_LIST void mlir::linalg::registerTilingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) { registerOne(ctx); registerAll< #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(ctx); }); }