//===- TosaToLinalg.cpp - Lowering Tosa to Linalg Dialect -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // These rewriters lower from the Tosa to the Linalg dialect. // //===----------------------------------------------------------------------===// #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include using namespace mlir; using namespace mlir::tosa; template static arith::ConstantOp createConstFromIntAttribute(Operation *op, const std::string &attrName, Type requiredAttrType, OpBuilder &rewriter) { auto castedN = static_cast( cast(op->getAttr(attrName)).getValue().getSExtValue()); return rewriter.create( op->getLoc(), IntegerAttr::get(requiredAttrType, castedN)); } static Value createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, ArrayRef resultTypes, PatternRewriter &rewriter) { Location loc = op->getLoc(); auto elementTy = cast(op->getOperand(0).getType()).getElementType(); // tosa::AbsOp if 
(isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); if (isa(op) && isa(elementTy)) { auto zero = rewriter.create( loc, rewriter.getZeroAttr(elementTy)); auto cmp = rewriter.create(loc, arith::CmpIPredicate::sgt, args[0], zero); auto neg = rewriter.create(loc, zero, args[0]); return rewriter.create(loc, cmp, args[0], neg); } // tosa::AddOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::SubOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::MulOp if (isa(op) && isa(elementTy)) { if (dyn_cast(op).getShift() != 0) { (void)rewriter.notifyMatchFailure(op, "Cannot have shift value for float"); return nullptr; } return rewriter.create(loc, resultTypes, args); } // tosa::DivOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::ReciprocalOp if (isa(op) && isa(elementTy)) { auto one = rewriter.create(loc, FloatAttr::get(elementTy, 1)); return rewriter.create(loc, resultTypes, one, args[0]); } if (isa(op) && isa(elementTy)) { Value a = args[0]; Value b = args[1]; auto shift = cast(op->getAttr("shift")).getValue().getSExtValue(); if (shift > 0) { auto shiftConst = rewriter.create(loc, shift, /*bitwidth=*/8); if (!a.getType().isInteger(32)) a = rewriter.create(loc, rewriter.getI32Type(), a); if (!b.getType().isInteger(32)) b = rewriter.create(loc, rewriter.getI32Type(), b); auto result = rewriter.create( loc, rewriter.getI32Type(), a, b, shiftConst, rewriter.getBoolAttr(false)); if (elementTy.isInteger(32)) return result; return rewriter.create(loc, elementTy, result); } int aWidth = a.getType().getIntOrFloatBitWidth(); int bWidth = b.getType().getIntOrFloatBitWidth(); int cWidth = resultTypes[0].getIntOrFloatBitWidth(); if (aWidth < cWidth) a = rewriter.create(loc, resultTypes[0], a); 
if (bWidth < cWidth) b = rewriter.create(loc, resultTypes[0], b); return rewriter.create(loc, resultTypes, a, b); } // tosa::NegateOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); if (isa(op) && isa(elementTy) && !cast(op).getQuantizationInfo()) { auto constant = rewriter.create(loc, IntegerAttr::get(elementTy, 0)); return rewriter.create(loc, resultTypes, constant, args[0]); } if (isa(op) && isa(elementTy) && cast(op).getQuantizationInfo()) { auto quantizationInfo = cast(op).getQuantizationInfo(); int32_t inputBitWidth = elementTy.getIntOrFloatBitWidth(); int64_t inZp = quantizationInfo.value().getInputZp(); int64_t outZp = quantizationInfo.value().getOutputZp(); // Compute the maximum value that can occur in the intermediate buffer. int64_t zpAdd = inZp + outZp; int64_t maxValue = APInt::getSignedMaxValue(inputBitWidth).getSExtValue() + std::abs(zpAdd) + 1; // Convert that maximum value into the maximum bitwidth needed to represent // it. We assume 48-bit numbers may be supported further in the pipeline. int intermediateBitWidth = 64; if (maxValue <= APInt::getSignedMaxValue(16).getSExtValue()) { intermediateBitWidth = 16; } else if (maxValue <= APInt::getSignedMaxValue(32).getSExtValue()) { intermediateBitWidth = 32; } else if (maxValue <= APInt::getSignedMaxValue(48).getSExtValue()) { intermediateBitWidth = 48; } Type intermediateType = rewriter.getIntegerType(intermediateBitWidth); Value zpAddValue = rewriter.create( loc, rewriter.getIntegerAttr(intermediateType, zpAdd)); // The negation can be applied by doing: // outputValue = inZp + outZp - inputValue auto ext = rewriter.create(loc, intermediateType, args[0]); auto sub = rewriter.create(loc, zpAddValue, ext); // Clamp to the negation range. 
Value min = rewriter.create( loc, APInt::getSignedMinValue(inputBitWidth).getSExtValue(), intermediateType); Value max = rewriter.create( loc, APInt::getSignedMaxValue(inputBitWidth).getSExtValue(), intermediateType); auto clamp = clampIntHelper(loc, sub, min, max, rewriter); // Truncate to the final value. return rewriter.create(loc, elementTy, clamp); } // tosa::BitwiseAndOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::BitwiseOrOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::BitwiseNotOp if (isa(op) && isa(elementTy)) { auto allOnesAttr = rewriter.getIntegerAttr( elementTy, APInt::getAllOnes(elementTy.getIntOrFloatBitWidth())); auto allOnes = rewriter.create(loc, allOnesAttr); return rewriter.create(loc, resultTypes, args[0], allOnes); } // tosa::BitwiseXOrOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::LogicalLeftShiftOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::LogicalRightShiftOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::ArithmeticRightShiftOp if (isa(op) && isa(elementTy)) { auto result = rewriter.create(loc, resultTypes, args); auto round = cast(op->getAttr("round")).getValue(); if (!round) { return result; } Type i1Ty = IntegerType::get(rewriter.getContext(), /*width=*/1); auto one = rewriter.create(loc, IntegerAttr::get(elementTy, 1)); auto zero = rewriter.create(loc, IntegerAttr::get(elementTy, 0)); auto i1one = rewriter.create(loc, IntegerAttr::get(i1Ty, 1)); // Checking that input2 != 0 auto shiftValueGreaterThanZero = rewriter.create( loc, arith::CmpIPredicate::sgt, args[1], zero); // Checking for the last bit of input1 to be 1 auto subtract = rewriter.create(loc, resultTypes, args[1], one); auto shifted = rewriter.create(loc, resultTypes, args[0], subtract) ->getResults(); auto truncated = rewriter.create(loc, i1Ty, shifted, 
std::nullopt); auto isInputOdd = rewriter.create(loc, i1Ty, truncated, i1one); auto shouldRound = rewriter.create( loc, i1Ty, shiftValueGreaterThanZero, isInputOdd); auto extended = rewriter.create(loc, resultTypes, shouldRound); return rewriter.create(loc, resultTypes, result, extended); } // tosa::ClzOp if (isa(op) && isa(elementTy)) { return rewriter.create(loc, elementTy, args[0]); } // tosa::LogicalAnd if (isa(op) && elementTy.isInteger(1)) return rewriter.create(loc, resultTypes, args); // tosa::LogicalNot if (isa(op) && elementTy.isInteger(1)) { auto one = rewriter.create( loc, rewriter.getIntegerAttr(elementTy, 1)); return rewriter.create(loc, resultTypes, args[0], one); } // tosa::LogicalOr if (isa(op) && elementTy.isInteger(1)) return rewriter.create(loc, resultTypes, args); // tosa::LogicalXor if (isa(op) && elementTy.isInteger(1)) return rewriter.create(loc, resultTypes, args); // tosa::PowOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::RsqrtOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::LogOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::ExpOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::TanhOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::ErfOp if (isa(op) && llvm::isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::GreaterOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, arith::CmpFPredicate::OGT, args[0], args[1]); if (isa(op) && elementTy.isSignlessInteger()) return rewriter.create(loc, arith::CmpIPredicate::sgt, args[0], args[1]); // tosa::GreaterEqualOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, arith::CmpFPredicate::OGE, args[0], args[1]); if (isa(op) && elementTy.isSignlessInteger()) return rewriter.create(loc, arith::CmpIPredicate::sge, args[0], args[1]); // tosa::EqualOp if (isa(op) 
&& isa(elementTy)) return rewriter.create(loc, arith::CmpFPredicate::OEQ, args[0], args[1]); if (isa(op) && elementTy.isSignlessInteger()) return rewriter.create(loc, arith::CmpIPredicate::eq, args[0], args[1]); // tosa::SelectOp if (isa(op)) { elementTy = cast(op->getOperand(1).getType()).getElementType(); if (isa(elementTy) || isa(elementTy)) return rewriter.create(loc, args[0], args[1], args[2]); } // tosa::MaximumOp if (isa(op) && isa(elementTy)) { return rewriter.create(loc, args[0], args[1]); } if (isa(op) && elementTy.isSignlessInteger()) { auto predicate = rewriter.create( loc, arith::CmpIPredicate::sgt, args[0], args[1]); return rewriter.create(loc, predicate, args[0], args[1]); } // tosa::MinimumOp if (isa(op) && isa(elementTy)) { return rewriter.create(loc, args[0], args[1]); } if (isa(op) && elementTy.isSignlessInteger()) { auto predicate = rewriter.create( loc, arith::CmpIPredicate::slt, args[0], args[1]); return rewriter.create(loc, predicate, args[0], args[1]); } // tosa::CeilOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::FloorOp if (isa(op) && isa(elementTy)) return rewriter.create(loc, resultTypes, args); // tosa::ClampOp if (isa(op) && isa(elementTy)) { bool losesInfo = false; APFloat minApf = cast(op->getAttr("min_fp")).getValue(); APFloat maxApf = cast(op->getAttr("max_fp")).getValue(); minApf.convert(cast(elementTy).getFloatSemantics(), APFloat::rmNearestTiesToEven, &losesInfo); maxApf.convert(cast(elementTy).getFloatSemantics(), APFloat::rmNearestTiesToEven, &losesInfo); auto min = rewriter.create( loc, elementTy, rewriter.getFloatAttr(elementTy, minApf)); auto max = rewriter.create( loc, elementTy, rewriter.getFloatAttr(elementTy, maxApf)); return clampFloatHelper(loc, args[0], min, max, rewriter); } if (isa(op) && isa(elementTy)) { auto intTy = cast(elementTy); int32_t min = static_cast( cast(op->getAttr("min_int")).getValue().getSExtValue()); int32_t max = static_cast( 
cast(op->getAttr("max_int")).getValue().getSExtValue()); if (intTy.isUnsignedInteger()) { min = std::max(min, 0); max = std::min( max, APInt::getMaxValue(intTy.getIntOrFloatBitWidth()).getSExtValue()); } else { min = std::max( min, APInt::getSignedMinValue(intTy.getIntOrFloatBitWidth()) .getSExtValue()); max = std::min( max, APInt::getSignedMaxValue(intTy.getIntOrFloatBitWidth()) .getSExtValue()); } auto minVal = rewriter.create( loc, min, intTy.getIntOrFloatBitWidth()); auto maxVal = rewriter.create( loc, max, intTy.getIntOrFloatBitWidth()); return clampIntHelper(loc, args[0], minVal, maxVal, rewriter); } // tosa::SigmoidOp if (isa(op) && isa(elementTy)) { auto one = rewriter.create(loc, FloatAttr::get(elementTy, 1)); auto negate = rewriter.create(loc, resultTypes, args[0]); auto exp = rewriter.create(loc, resultTypes, negate); auto added = rewriter.create(loc, resultTypes, exp, one); return rewriter.create(loc, resultTypes, one, added); } // tosa::CastOp if (isa(op)) { Type srcTy = elementTy; Type dstTy = resultTypes.front(); bool bitExtend = srcTy.getIntOrFloatBitWidth() < dstTy.getIntOrFloatBitWidth(); if (srcTy == dstTy) return args.front(); if (isa(srcTy) && isa(dstTy) && bitExtend) return rewriter.create(loc, resultTypes, args, std::nullopt); if (isa(srcTy) && isa(dstTy) && !bitExtend) return rewriter.create(loc, resultTypes, args, std::nullopt); // 1-bit integers need to be treated as signless. if (srcTy.isInteger(1) && arith::UIToFPOp::areCastCompatible(srcTy, dstTy)) return rewriter.create(loc, resultTypes, args, std::nullopt); if (srcTy.isInteger(1) && isa(dstTy) && bitExtend) return rewriter.create(loc, resultTypes, args, std::nullopt); // Unsigned integers need an unrealized cast so that they can be passed // to UIToFP. 
if (srcTy.isUnsignedInteger() && isa(dstTy)) { auto unrealizedCast = rewriter .create( loc, rewriter.getIntegerType(srcTy.getIntOrFloatBitWidth()), args[0]) .getResult(0); return rewriter.create(loc, resultTypes[0], unrealizedCast); } // All other si-to-fp conversions should be handled by SIToFP. if (arith::SIToFPOp::areCastCompatible(srcTy, dstTy)) return rewriter.create(loc, resultTypes, args, std::nullopt); // Casting to boolean, floats need to only be checked as not-equal to zero. if (isa(srcTy) && dstTy.isInteger(1)) { Value zero = rewriter.create( loc, rewriter.getFloatAttr(srcTy, 0.0)); return rewriter.create(loc, arith::CmpFPredicate::UNE, args.front(), zero); } if (arith::FPToSIOp::areCastCompatible(srcTy, dstTy)) { auto intMin = rewriter.create( loc, rewriter.getFloatAttr( getElementTypeOrSelf(srcTy), APInt::getSignedMinValue(dstTy.getIntOrFloatBitWidth()) .getSExtValue())); auto intMax = rewriter.create( loc, rewriter.getFloatAttr( getElementTypeOrSelf(srcTy), APInt::getSignedMaxValue(dstTy.getIntOrFloatBitWidth()) .getSExtValue())); auto rounded = rewriter.create(loc, args[0]); auto clamped = clampFloatHelper(loc, rounded, intMin, intMax, rewriter); return rewriter.create(loc, dstTy, clamped); } // Casting to boolean, integers need to only be checked as not-equal to // zero. 
if (isa(srcTy) && dstTy.isInteger(1)) { Value zero = rewriter.create( loc, 0, srcTy.getIntOrFloatBitWidth()); return rewriter.create(loc, arith::CmpIPredicate::ne, args.front(), zero); } if (isa(srcTy) && isa(dstTy) && bitExtend) return rewriter.create(loc, resultTypes, args, std::nullopt); if (isa(srcTy) && isa(dstTy) && !bitExtend) { return rewriter.create(loc, dstTy, args[0]); } } (void)rewriter.notifyMatchFailure( op, "unhandled op for linalg body calculation for elementwise op"); return nullptr; } static Value expandRank(PatternRewriter &rewriter, Location loc, Value tensor, int64_t rank) { // No need to expand if we are already at the desired rank auto shapedType = dyn_cast(tensor.getType()); assert(shapedType && shapedType.hasRank() && "expected a ranked shaped type"); int64_t numExtraDims = rank - shapedType.getRank(); assert(numExtraDims >= 0 && "cannot expand tensor to a lower rank"); if (!numExtraDims) return tensor; // Compute reassociation indices SmallVector> reassociationIndices( shapedType.getRank()); int64_t index = 0; for (index = 0; index <= numExtraDims; index++) reassociationIndices[0].push_back(index); for (size_t position = 1; position < reassociationIndices.size(); position++) reassociationIndices[position].push_back(index++); // Compute result type SmallVector resultShape; for (index = 0; index < numExtraDims; index++) resultShape.push_back(1); for (auto size : shapedType.getShape()) resultShape.push_back(size); auto resultType = RankedTensorType::get(resultShape, shapedType.getElementType()); // Emit 'tensor.expand_shape' op return rewriter.create(loc, resultType, tensor, reassociationIndices); } static SmallVector expandInputRanks(PatternRewriter &rewriter, Location loc, Operation *operation) { auto rank = operation->getResultTypes().front().cast().getRank(); return llvm::map_to_vector(operation->getOperands(), [&](Value operand) { return expandRank(rewriter, loc, operand, rank); }); } using IndexPool = DenseMap; // Emit an 
'arith.constant' op for the given index if it has not been created // yet, or return an existing constant. This will prevent an excessive creation // of redundant constants, easing readability of emitted code for unit tests. static Value createIndex(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, int64_t index) { auto [it, inserted] = indexPool.try_emplace(index); if (inserted) it->second = rewriter.create(loc, rewriter.getIndexAttr(index)); return it->second; } static Value getTensorDim(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value tensor, int64_t index) { auto indexValue = createIndex(rewriter, loc, indexPool, index); return rewriter.create(loc, tensor, indexValue).getResult(); } static OpFoldResult getOrFoldTensorDim(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value tensor, int64_t index) { auto shapedType = dyn_cast(tensor.getType()); assert(shapedType && shapedType.hasRank() && "expected a ranked shaped type"); assert(index >= 0 && index < shapedType.getRank() && "index out of bounds"); if (shapedType.isDynamicDim(index)) return getTensorDim(rewriter, loc, indexPool, tensor, index); return rewriter.getIndexAttr(shapedType.getDimSize(index)); } static bool operandsAndResultsRanked(Operation *operation) { auto isRanked = [](Value value) { return isa(value.getType()); }; return llvm::all_of(operation->getOperands(), isRanked) && llvm::all_of(operation->getResults(), isRanked); } // Compute the runtime dimension size for dimension 'dim' of the output by // inspecting input 'operands', all of which are expected to have the same rank. // This function returns a pair {targetSize, masterOperand}. // // The runtime size of the output dimension is returned either as a statically // computed attribute or as a runtime SSA value. // // If the target size was inferred directly from one dominating operand, that // operand is returned in 'masterOperand'. 
If the target size is inferred from // multiple operands, 'masterOperand' is set to nullptr. static std::pair computeTargetSize(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, ValueRange operands, int64_t dim) { // If any input operand contains a static size greater than 1 for this // dimension, that is the target size. An occurrence of an additional static // dimension greater than 1 with a different value is undefined behavior. for (auto operand : operands) { auto size = operand.getType().cast().getDimSize(dim); if (!ShapedType::isDynamic(size) && size > 1) return {rewriter.getIndexAttr(size), operand}; } // Filter operands with dynamic dimension auto operandsWithDynamicDim = llvm::to_vector(llvm::make_filter_range(operands, [&](Value operand) { return operand.getType().cast().isDynamicDim(dim); })); // If no operand has a dynamic dimension, it means all sizes were 1 if (operandsWithDynamicDim.empty()) return {rewriter.getIndexAttr(1), operands.front()}; // Emit code that computes the runtime size for this dimension. If there is // only one operand with a dynamic dimension, it is considered the master // operand that determines the runtime size of the output dimension. auto targetSize = getTensorDim(rewriter, loc, indexPool, operandsWithDynamicDim[0], dim); if (operandsWithDynamicDim.size() == 1) return {targetSize, operandsWithDynamicDim[0]}; // Calculate maximum size among all dynamic dimensions for (size_t i = 1; i < operandsWithDynamicDim.size(); i++) { auto nextSize = getTensorDim(rewriter, loc, indexPool, operandsWithDynamicDim[i], dim); targetSize = rewriter.create(loc, targetSize, nextSize); } return {targetSize, nullptr}; } // Compute the runtime output size for all dimensions. This function returns // a pair {targetShape, masterOperands}. 
static std::pair, SmallVector> computeTargetShape(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, ValueRange operands) { assert(!operands.empty()); auto rank = operands.front().getType().cast().getRank(); SmallVector targetShape; SmallVector masterOperands; for (auto dim : llvm::seq(0, rank)) { auto [targetSize, masterOperand] = computeTargetSize(rewriter, loc, indexPool, operands, dim); targetShape.push_back(targetSize); masterOperands.push_back(masterOperand); } return {targetShape, masterOperands}; } static Value broadcastDynamicDimension(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value operand, int64_t dim, OpFoldResult targetSize, Value masterOperand) { // Nothing to do if this is a static dimension auto rankedTensorType = operand.getType().cast(); if (!rankedTensorType.isDynamicDim(dim)) return operand; // If the target size for this dimension was directly inferred by only taking // this operand into account, there is no need to broadcast. This is an // optimization that will prevent redundant control flow, and constitutes the // main motivation for tracking "master operands". if (operand == masterOperand) return operand; // Affine maps for 'linalg.generic' op auto rank = rankedTensorType.getRank(); SmallVector affineExprs; for (auto index : llvm::seq(0, rank)) { auto affineExpr = index == dim ? 
rewriter.getAffineConstantExpr(0) : rewriter.getAffineDimExpr(index); affineExprs.push_back(affineExpr); } auto broadcastAffineMap = AffineMap::get(rank, 0, affineExprs, rewriter.getContext()); auto identityAffineMap = rewriter.getMultiDimIdentityMap(rank); SmallVector affineMaps = {broadcastAffineMap, identityAffineMap}; // Check if broadcast is necessary auto one = createIndex(rewriter, loc, indexPool, 1); auto runtimeSize = getTensorDim(rewriter, loc, indexPool, operand, dim); auto broadcastNecessary = rewriter.create( loc, arith::CmpIPredicate::eq, runtimeSize, one); // Emit 'then' region of 'scf.if' auto emitThenRegion = [&](OpBuilder &opBuilder, Location loc) { // Emit 'tensor.empty' op SmallVector outputTensorShape; for (auto index : llvm::seq(0, rank)) { auto size = index == dim ? targetSize : getOrFoldTensorDim(rewriter, loc, indexPool, operand, index); outputTensorShape.push_back(size); } Value outputTensor = opBuilder.create( loc, outputTensorShape, rankedTensorType.getElementType()); // Emit 'linalg.generic' op auto resultTensor = opBuilder .create( loc, outputTensor.getType(), operand, outputTensor, affineMaps, getNParallelLoopsAttrs(rank), [&](OpBuilder &opBuilder, Location loc, ValueRange blockArgs) { // Emit 'linalg.yield' op opBuilder.create(loc, blockArgs.front()); }) .getResult(0); // Cast to original operand type if necessary auto castResultTensor = rewriter.createOrFold( loc, operand.getType(), resultTensor); // Emit 'scf.yield' op opBuilder.create(loc, castResultTensor); }; // Emit 'else' region of 'scf.if' auto emitElseRegion = [&](OpBuilder &opBuilder, Location loc) { opBuilder.create(loc, operand); }; // Emit 'scf.if' op auto ifOp = rewriter.create(loc, broadcastNecessary, emitThenRegion, emitElseRegion); return ifOp.getResult(0); } static Value broadcastDynamicDimensions(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value operand, ArrayRef targetShape, ArrayRef masterOperands) { size_t rank = 
operand.getType().cast().getRank(); assert(targetShape.size() == rank); assert(masterOperands.size() == rank); for (auto index : llvm::seq(0, rank)) operand = broadcastDynamicDimension(rewriter, loc, indexPool, operand, index, targetShape[index], masterOperands[index]); return operand; } static SmallVector broadcastDynamicDimensions(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, ValueRange operands, ArrayRef targetShape, ArrayRef masterOperands) { // No need to broadcast for unary operations if (operands.size() == 1) return operands; // Broadcast dynamic dimensions operand by operand return llvm::map_to_vector(operands, [&](Value operand) { return broadcastDynamicDimensions(rewriter, loc, indexPool, operand, targetShape, masterOperands); }); } static LogicalResult emitElementwiseComputation(PatternRewriter &rewriter, Location loc, Operation *operation, ValueRange operands, ArrayRef targetShape) { // Generate output tensor auto resultType = operation->getResultTypes().front().cast(); Value outputTensor = rewriter.create( loc, targetShape, resultType.getElementType()); // Create affine maps. Input affine maps broadcast static dimensions of size // 1. The output affine map is an identity map. // auto rank = resultType.getRank(); auto affineMaps = llvm::map_to_vector(operands, [&](Value operand) { auto shape = cast(operand.getType()).getShape(); SmallVector affineExprs; for (auto it : llvm::enumerate(shape)) { auto affineExpr = it.value() == 1 ? 
rewriter.getAffineConstantExpr(0) : rewriter.getAffineDimExpr(it.index()); affineExprs.push_back(affineExpr); } return AffineMap::get(rank, 0, affineExprs, rewriter.getContext()); }); affineMaps.push_back(rewriter.getMultiDimIdentityMap(rank)); // Emit 'linalg.generic' op bool encounteredError = false; auto linalgOp = rewriter.create( loc, outputTensor.getType(), operands, outputTensor, affineMaps, getNParallelLoopsAttrs(rank), [&](OpBuilder &opBuilder, Location loc, ValueRange blockArgs) { Value opResult = createLinalgBodyCalculationForElementwiseOp( operation, blockArgs.take_front(operation->getNumOperands()), {resultType.getElementType()}, rewriter); if (!opResult) { encounteredError = true; return; } opBuilder.create(loc, opResult); }); if (encounteredError) return rewriter.notifyMatchFailure( operation, "unable to create linalg.generic body for elementwise op"); // Cast 'linalg.generic' result into original result type if needed auto castResult = rewriter.createOrFold( loc, resultType, linalgOp->getResult(0)); rewriter.replaceOp(operation, castResult); return success(); } static LogicalResult elementwiseMatchAndRewriteHelper(Operation *operation, PatternRewriter &rewriter) { // Collect op properties assert(operation->getNumResults() == 1 && "elementwise op expects 1 result"); assert(operation->getNumOperands() >= 1 && "elementwise op expects at least 1 operand"); if (!operandsAndResultsRanked(operation)) return rewriter.notifyMatchFailure(operation, "Unranked tensors not supported"); // Lower operation IndexPool indexPool; auto loc = operation->getLoc(); auto expandedOperands = expandInputRanks(rewriter, loc, operation); auto [targetShape, masterOperands] = computeTargetShape(rewriter, loc, indexPool, expandedOperands); auto broadcastOperands = broadcastDynamicDimensions( rewriter, loc, indexPool, expandedOperands, targetShape, masterOperands); return emitElementwiseComputation(rewriter, loc, operation, broadcastOperands, targetShape); } // Returns the 
constant initial value for a given reduction operation. The // attribute type varies depending on the element type required. static TypedAttr createInitialValueForReduceOp(Operation *op, Type elementTy, PatternRewriter &rewriter) { if (isa(op) && isa(elementTy)) return rewriter.getFloatAttr(elementTy, 0.0); if (isa(op) && isa(elementTy)) return rewriter.getIntegerAttr(elementTy, 0); if (isa(op) && isa(elementTy)) return rewriter.getFloatAttr(elementTy, 1.0); if (isa(op) && isa(elementTy)) return rewriter.getIntegerAttr(elementTy, 1); if (isa(op) && isa(elementTy)) return rewriter.getFloatAttr( elementTy, APFloat::getLargest( cast(elementTy).getFloatSemantics(), false)); if (isa(op) && isa(elementTy)) return rewriter.getIntegerAttr( elementTy, APInt::getSignedMaxValue(elementTy.getIntOrFloatBitWidth())); if (isa(op) && isa(elementTy)) return rewriter.getFloatAttr( elementTy, APFloat::getLargest( cast(elementTy).getFloatSemantics(), true)); if (isa(op) && isa(elementTy)) return rewriter.getIntegerAttr( elementTy, APInt::getSignedMinValue(elementTy.getIntOrFloatBitWidth())); if (isa(op) && elementTy.isInteger(1)) return rewriter.getIntegerAttr(elementTy, APInt::getAllOnes(1)); if (isa(op) && elementTy.isInteger(1)) return rewriter.getIntegerAttr(elementTy, APInt::getZero(1)); if (isa(op) && isa(elementTy)) return rewriter.getFloatAttr( elementTy, APFloat::getLargest( cast(elementTy).getFloatSemantics(), true)); if (isa(op) && isa(elementTy)) return rewriter.getIntegerAttr( elementTy, APInt::getSignedMinValue(elementTy.getIntOrFloatBitWidth())); return {}; } // Creates the body calculation for a reduction. The operations vary depending // on the input type. 
// Returns the value of the operation created for the first guard that holds,
// or a null Value (`{}`) when no guard holds.
//
// NOTE(review): the template arguments of every `isa` and `rewriter.create`
// in this function are missing from this text, so neither the op kind each
// guard tests nor the operation each branch builds can be read from here.
// Confirm against the original file before relying on the remarks below.
static Value createLinalgBodyCalculationForReduceOp(Operation *op,
                                                    ValueRange args,
                                                    Type elementTy,
                                                    PatternRewriter &rewriter) {
  Location loc = op->getLoc();
  // The first four branches pass the whole `args` range to the created op.
  if (isa(op) && isa(elementTy)) {
    return rewriter.create(loc, args);
  }

  if (isa(op) && isa(elementTy)) {
    return rewriter.create(loc, args);
  }

  if (isa(op) && isa(elementTy)) {
    return rewriter.create(loc, args);
  }

  if (isa(op) && isa(elementTy)) {
    return rewriter.create(loc, args);
  }

  // From here on the branches use exactly `args[0]` and `args[1]`.
  if (isa(op) && isa(elementTy)) {
    return rewriter.create(loc, args[0], args[1]);
  }

  // Signed less-than comparison followed by an op taking (predicate, args[0],
  // args[1]); presumably a select that keeps the smaller value -- TODO confirm.
  if (isa(op) && isa(elementTy)) {
    auto predicate = rewriter.create(
        loc, arith::CmpIPredicate::slt, args[0], args[1]);
    return rewriter.create(loc, predicate, args[0], args[1]);
  }

  if (isa(op) && isa(elementTy)) {
    return rewriter.create(loc, args[0], args[1]);
  }

  // Signed greater-than comparison followed by an op taking (predicate,
  // args[0], args[1]); presumably a select that keeps the larger value -- TODO
  // confirm.
  if (isa(op) && isa(elementTy)) {
    auto predicate = rewriter.create(
        loc, arith::CmpIPredicate::sgt, args[0], args[1]);
    return rewriter.create(loc, predicate, args[0], args[1]);
  }

  // The last two branches only apply to a 1-bit integer element type.
  if (isa(op) && elementTy.isInteger(1))
    return rewriter.create(loc, args);

  if (isa(op) && elementTy.isInteger(1))
    return rewriter.create(loc, args);

  // No guard matched: hand back a null Value.
  return {};
}

// Performs the match and rewrite for reduction operations. This includes
// declaring a correctly sized initial value, and the linalg.generic operation
// that reduces across the specified axis.
//
// `op` supplies operand 0 (the input) and result 0 (whose type is the
// replacement's type); `axis` is the input dimension that is reduced.
// Returns a match failure when no initial value or no body calculation is
// available for `op`, and success() after replacing `op` otherwise.
//
// NOTE(review): template arguments of `cast`, `SmallVector` and
// `rewriter.create` are missing from this text; the concrete ops created here
// cannot be read from this view.
static LogicalResult reduceMatchAndRewriteHelper(Operation *op, uint64_t axis,
                                                 PatternRewriter &rewriter) {
  auto loc = op->getLoc();
  auto inputTy = cast(op->getOperand(0).getType());
  auto resultTy = cast(op->getResult(0).getType());
  auto elementTy = resultTy.getElementType();
  Value input = op->getOperand(0);

  // Collect the size of every input dimension except `axis`. For each of those
  // that is dynamic, also create a value from (input, i) -- presumably a
  // runtime dimension query -- and keep it in `dynDims`.
  SmallVector reduceShape;
  SmallVector dynDims;
  for (unsigned i = 0; i < inputTy.getRank(); i++) {
    if (axis != i) {
      reduceShape.push_back(inputTy.getDimSize(i));
      if (inputTy.isDynamicDim(i))
        dynDims.push_back(rewriter.create(loc, input, i));
    }
  }

  // First fill the output buffer with the init value.
  auto emptyTensor =
      rewriter
          .create(loc, reduceShape, resultTy.getElementType(), dynDims)
          .getResult();

  auto fillValueAttr = createInitialValueForReduceOp(op, elementTy, rewriter);
  if (!fillValueAttr)
    return rewriter.notifyMatchFailure(
        op, "No initial value found for reduction operation");

  auto fillValue = rewriter.create(loc, fillValueAttr);
  auto filledTensor =
      rewriter
          .create(loc, ValueRange{fillValue}, ValueRange{emptyTensor})
          .result();

  // NOTE(review): despite its name, `didEncounterError` is set to true when
  // the body calculation was created successfully, and the rewrite fails
  // below when it is still false. The name reads inverted relative to its use.
  bool didEncounterError = false;
  auto linalgOp = rewriter.create(
      loc, input, filledTensor, axis,
      [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange blockArgs) {
        // The body is built with the outer `rewriter` and `loc`, not with
        // `nestedBuilder`/`nestedLoc`.
        auto result = createLinalgBodyCalculationForReduceOp(
            op, blockArgs, elementTy, rewriter);
        if (result)
          didEncounterError = true;

        // NOTE(review): this is created even when `result` is null.
        nestedBuilder.create(loc, result);
      });

  if (!didEncounterError)
    return rewriter.notifyMatchFailure(
        op, "unable to create linalg.generic body for reduce op");

  // Build one reassociation group per dimension of the reduced result. Group
  // `i` starts with expanded dimension `i`, or `i + 1` when `i` lies after
  // `axis`, which leaves a gap at the reduced position.
  SmallVector reassociationMap;
  uint64_t expandInputRank =
      cast(linalgOp.getResults()[0].getType()).getRank();
  reassociationMap.resize(expandInputRank);

  for (uint64_t i = 0; i < expandInputRank; i++) {
    int32_t dimToPush = i > axis ? i + 1 : i;
    reassociationMap[i].push_back(rewriter.getAffineDimExpr(dimToPush));
  }

  // Attach one more dimension (`expandedDim + 1`) to the group at `axis`, or
  // to the last group when `axis` is not a valid group index. A rank-0
  // reduced result keeps an empty map.
  if (expandInputRank != 0) {
    int32_t expandedDim = axis < expandInputRank ? axis : expandInputRank - 1;
    reassociationMap[expandedDim].push_back(
        rewriter.getAffineDimExpr(expandedDim + 1));
  }

  // Lower directly to `tensor::ExpandShapeOp` instead of `tosa::ReshapeOp`,
  // since here we know which dimension to expand, and `tosa::ReshapeOp` would
  // not have access to such information. This matters when handling dynamically
  // sized tensors.
  rewriter.replaceOpWithNewOp(
      op, resultTy, linalgOp.getResults()[0], reassociationMap);
  return success();
}

namespace {

// Rewrite pattern whose matchAndRewrite forwards to
// `elementwiseMatchAndRewriteHelper`, which is defined outside this view.
template
class PointwiseConverter : public OpRewritePattern {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(SrcOp op,
                                PatternRewriter &rewriter) const final {
    return elementwiseMatchAndRewriteHelper(op, rewriter);
  }
};

// Rewrite pattern for `tosa::RescaleOp`. It builds a single op with a
// per-element body that adjusts by the input zero-point, applies the
// multiplier/shift, adjusts by the output zero-point and clamps to the output
// integer range, then replaces `op` with that op's results.
//
// NOTE(review): template arguments of `cast`, `SmallVector`, `static_cast`,
// `ArrayRef`, `createConstFromIntAttribute` and every `create` call are
// missing from this text; remarks on which op a call builds are hedged.
class RescaleConverter : public OpRewritePattern {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(tosa::RescaleOp op,
                                PatternRewriter &rewriter) const final {
    auto loc = op.getLoc();
    auto input = op.getInput();
    auto inputTy = cast(op.getInput().getType());
    auto outputTy = cast(op.getOutput().getType());
    unsigned rank = inputTy.getRank();

    // This is an illegal configuration. terminate and log an error
    if (op.getDoubleRound() && !op.getScale32())
      return rewriter.notifyMatchFailure(
          op, "tosa.rescale requires scale32 for double_round to be true");

    // For every dynamic output dimension, create a value from (input, i).
    // NOTE(review): this reads dimension `i` of the input for a dynamic
    // dimension of the output, which assumes both have matching shapes.
    SmallVector dynDims;
    for (int i = 0; i < outputTy.getRank(); i++) {
      if (outputTy.isDynamicDim(i)) {
        dynDims.push_back(rewriter.create(loc, input, i));
      }
    }

    // The shift and multiplier values.
    SmallVector multiplierValues(op.getMultiplier());
    SmallVector shiftValues(op.getShift());

    // If we shift by more than the bitwidth, this just sets to 0.
    // NOTE(review): the loop bound is the multiplier count but `shiftValues`
    // is indexed too; assumes both arrays have the same length -- confirm.
    for (int i = 0, s = multiplierValues.size(); i < s; i++) {
      if (shiftValues[i] > 63) {
        shiftValues[i] = 0;
        multiplierValues[i] = 0;
      }
    }

    // Double round only occurs if shift is greater than 31, check that this
    // is ever true.
    bool doubleRound =
        op.getDoubleRound() &&
        llvm::any_of(shiftValues, [](int32_t v) { return v > 31; });

    // The input is always the first operand and uses the identity map.
    SmallVector indexingMaps = {
        rewriter.getMultiDimIdentityMap(rank)};
    SmallVector genericInputs = {input};

    // If we are rescaling per-channel then we need to store the multiplier
    // values in a buffer.
    // A single multiplier becomes a constant; otherwise the values are added
    // as an extra input indexed by the last dimension (`rank - 1`), and
    // `multiplierArg` records that input's position.
    Value multiplierConstant;
    int64_t multiplierArg = 0;
    if (multiplierValues.size() == 1) {
      multiplierConstant = rewriter.create(
          loc, rewriter.getI32IntegerAttr(multiplierValues.front()));
    } else {
      SmallVector multiplierExprs{
          rewriter.getAffineDimExpr(rank - 1)};
      auto multiplierType =
          RankedTensorType::get({static_cast(multiplierValues.size())},
                                rewriter.getI32Type());
      genericInputs.push_back(rewriter.create(
          loc, DenseIntElementsAttr::get(multiplierType, multiplierValues)));

      indexingMaps.push_back(AffineMap::get(/*dimCount=*/rank,
                                            /*symbolCount=*/0, multiplierExprs,
                                            rewriter.getContext()));

      multiplierArg = indexingMaps.size() - 1;
    }

    // If we are rescaling per-channel then we need to store the shift
    // values in a buffer.
    // Same scheme as the multiplier, using 8-bit integers.
    Value shiftConstant;
    int64_t shiftArg = 0;
    if (shiftValues.size() == 1) {
      shiftConstant = rewriter.create(
          loc, rewriter.getI8IntegerAttr(shiftValues.front()));
    } else {
      SmallVector shiftExprs = {
          rewriter.getAffineDimExpr(rank - 1)};
      auto shiftType =
          RankedTensorType::get({static_cast(shiftValues.size())},
                                rewriter.getIntegerType(8));
      genericInputs.push_back(rewriter.create(
          loc, DenseIntElementsAttr::get(shiftType, shiftValues)));
      indexingMaps.push_back(AffineMap::get(/*dimCount=*/rank,
                                            /*symbolCount=*/0, shiftExprs,
                                            rewriter.getContext()));
      shiftArg = indexingMaps.size() - 1;
    }

    // Indexing maps for output values.
    indexingMaps.push_back(rewriter.getMultiDimIdentityMap(rank));

    // Construct the indexing maps needed for linalg.generic ops.
    // (The statement directly below creates the output operand from the
    // output shape, element type and the dynamic sizes collected above.)
    Value emptyTensor = rewriter.create(
        loc, outputTy.getShape(), outputTy.getElementType(),
        ArrayRef({dynDims}));

    auto linalgOp = rewriter.create(
        loc, outputTy, genericInputs, ValueRange{emptyTensor}, indexingMaps,
        getNParallelLoopsAttrs(rank),
        [&](OpBuilder &nestedBuilder, Location nestedLoc,
            ValueRange blockArgs) {
          Value value = blockArgs[0];
          Type valueTy = value.getType();

          // For now we do all of our math in 64-bit. This is not optimal but
          // should be correct for now, consider computing correct bit depth
          // later.
          // NOTE(review): the code below selects a 48- or 32-bit width for
          // the input zero-point, which does not match the "64-bit" remark.
          int32_t inBitwidth = valueTy.getIntOrFloatBitWidth() > 32 ? 48 : 32;

          // Zero-point constants are read from the op's "input_zp" and
          // "output_zp" attributes.
          auto inputZp = createConstFromIntAttribute(
              op, "input_zp", nestedBuilder.getIntegerType(inBitwidth),
              nestedBuilder);
          auto outputZp = createConstFromIntAttribute(
              op, "output_zp", nestedBuilder.getI32Type(), nestedBuilder);

          // Use the hoisted constant when there is one, otherwise the
          // per-element block argument recorded above.
          Value multiplier = multiplierConstant ? multiplierConstant
                                                : blockArgs[multiplierArg];
          Value shift = shiftConstant ? shiftConstant : blockArgs[shiftArg];

          // Inputs narrower than 32 bits are converted to the i32 type first.
          // Unsigned inputs take an extra step through a signless integer
          // type of the same width (presumably a cast followed by a
          // zero-extension; the other path presumably sign-extends -- TODO
          // confirm).
          if (valueTy.getIntOrFloatBitWidth() < 32) {
            if (valueTy.isUnsignedInteger()) {
              value = nestedBuilder
                          .create(
                              nestedLoc,
                              nestedBuilder.getIntegerType(
                                  valueTy.getIntOrFloatBitWidth()),
                              value)
                          .getResult(0);
              value = nestedBuilder.create(
                  nestedLoc, nestedBuilder.getI32Type(), value);
            } else {
              value = nestedBuilder.create(
                  nestedLoc, nestedBuilder.getI32Type(), value);
            }
          }

          // Combine with the input zero-point (presumably a subtraction).
          value =
              nestedBuilder.create(nestedLoc, value, inputZp);

          // Apply multiplier and shift, producing an i32 value.
          value = nestedBuilder.create(
              loc, nestedBuilder.getI32Type(), value, multiplier, shift,
              nestedBuilder.getBoolAttr(doubleRound));

          // Move to the new zero-point.
          value =
              nestedBuilder.create(nestedLoc, value, outputZp);

          // Saturate to the output size.
          // NOTE(review): the bounds are held in int32_t, so an unsigned
          // 32-bit maximum, or the bounds of an output wider than 32 bits,
          // would not fit -- confirm whether such outputs can reach here.
          IntegerType outIntType =
              cast(blockArgs.back().getType());
          unsigned outBitWidth = outIntType.getWidth();

          int32_t intMin = APInt::getSignedMinValue(outBitWidth).getSExtValue();
          int32_t intMax = APInt::getSignedMaxValue(outBitWidth).getSExtValue();

          // Unsigned integers have a different output value.
          if (outIntType.isUnsignedInteger()) {
            intMin = 0;
            intMax = APInt::getMaxValue(outBitWidth).getZExtValue();
          }

          auto intMinVal = nestedBuilder.create(
              loc, nestedBuilder.getI32IntegerAttr(intMin));
          auto intMaxVal = nestedBuilder.create(
              loc, nestedBuilder.getI32IntegerAttr(intMax));

          value = clampIntHelper(nestedLoc, value, intMinVal, intMaxVal,
                                 nestedBuilder);

          // Outputs narrower than 32 bits are converted to a signless integer
          // of the output width (presumably a truncation), then cast to the
          // unsigned output type when the output is unsigned.
          if (outIntType.getWidth() < 32) {
            value = nestedBuilder.create(
                nestedLoc, rewriter.getIntegerType(outIntType.getWidth()),
                value);

            if (outIntType.isUnsignedInteger()) {
              value = nestedBuilder
                          .create(nestedLoc,
                                  outIntType, value)
                          .getResult(0);
            }
          }

          nestedBuilder.create(loc, value);
        });

    rewriter.replaceOp(op, linalgOp->getResults());
    return success();
  }
};

// Handle the resize case where the input is a 1x1 image. This case
// can entirely avoiding having extract operations which target much
// more difficult to optimize away.
class ResizeUnaryConverter : public OpRewritePattern {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(tosa::ResizeOp op,
                                PatternRewriter &rewriter) const final {
    Location loc = op.getLoc();
    ImplicitLocOpBuilder builder(loc, rewriter);
    auto input = op.getInput();
    auto inputTy = cast(input.getType());
    auto resultTy = cast(op.getType());
    const bool isBilinear = op.getMode() == "BILINEAR";

    auto inputH = inputTy.getDimSize(1);
    auto inputW = inputTy.getDimSize(2);
    auto outputH = resultTy.getDimSize(1);
    auto outputW = resultTy.getDimSize(2);

    if (inputH != 1 || inputW != 1 || outputH != 1 || outputW != 1)
      return rewriter.notifyMatchFailure(
          op, "tosa.resize is not a pure 1x1->1x1 image operation");

    // TODO(suderman): These string values should be declared the TOSA dialect.
if (op.getMode() != "NEAREST_NEIGHBOR" && op.getMode() != "BILINEAR") return rewriter.notifyMatchFailure( op, "tosa.resize mode should be NEAREST_NEIGHBOR or BILINEAR"); if (inputTy == resultTy) { rewriter.replaceOp(op, input); return success(); } ArrayRef scale = op.getScale(); // Collapse the unit width and height away. SmallVector reassociationMap(2); reassociationMap[0].push_back(builder.getAffineDimExpr(0)); reassociationMap[1].push_back(builder.getAffineDimExpr(1)); reassociationMap[1].push_back(builder.getAffineDimExpr(2)); reassociationMap[1].push_back(builder.getAffineDimExpr(3)); auto collapseTy = RankedTensorType::get({inputTy.getDimSize(0), inputTy.getDimSize(3)}, inputTy.getElementType()); Value collapse = builder.create(collapseTy, input, reassociationMap); // Get any dynamic shapes that appear in the input format. llvm::SmallVector outputDynSize; if (inputTy.isDynamicDim(0)) outputDynSize.push_back(builder.create(input, 0)); if (inputTy.isDynamicDim(3)) outputDynSize.push_back(builder.create(input, 3)); // Generate the elementwise operation for casting scaling the input value. auto genericTy = collapseTy.clone(resultTy.getElementType()); Value empty = builder.create( genericTy.getShape(), resultTy.getElementType(), outputDynSize); auto genericMap = rewriter.getMultiDimIdentityMap(genericTy.getRank()); SmallVector iterators(genericTy.getRank(), utils::IteratorType::parallel); auto generic = builder.create( genericTy, ValueRange{collapse}, ValueRange{empty}, ArrayRef{genericMap, genericMap}, iterators, [=](OpBuilder &b, Location loc, ValueRange args) { Value value = args[0]; // This is the quantized case. if (inputTy.getElementType() != resultTy.getElementType()) { value = b.create(loc, resultTy.getElementType(), value); if (isBilinear && scale[0] != 0) { Value scaleY = b.create( loc, b.getI32IntegerAttr(scale[0])); value = b.create