bolt/deps/llvm-18.1.8/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp

//===- ShardingInterface.cpp -------------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
#include "mlir/Dialect/Mesh/IR/MeshOps.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/Debug.h"

#include <algorithm>
#include <utility>

#define DEBUG_TYPE "sharding-interface"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

using namespace mlir;
using namespace mlir::mesh;

#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.cpp.inc"

//===----------------------------------------------------------------------===//
// common util functions
//===----------------------------------------------------------------------===//

/// Walks `expr` and flags in `seenIds` every loop dimension it references.
///
/// Accepted shapes are a bare dimension, the product of one dimension and one
/// constant (in either order), and sums of accepted shapes. Every other
/// expression kind is rejected. A dimension whose position lies outside
/// `seenIds`, or that has already been flagged, is rejected as well.
static LogicalResult
checkOperandAffineExprRecursively(AffineExpr expr,
                                  SmallVectorImpl<bool> &seenIds) {
  // Flags one dimension expression; fails on out-of-range or repeated use.
  auto markDim = [&seenIds](AffineExpr dimExpr) -> LogicalResult {
    unsigned pos = cast<AffineDimExpr>(dimExpr).getPosition();
    if (static_cast<size_t>(pos) >= seenIds.size() || seenIds[pos])
      return failure();
    seenIds[pos] = true;
    return success();
  };

  const AffineExprKind kind = expr.getKind();

  if (kind == AffineExprKind::DimId)
    return markDim(expr);

  if (kind == AffineExprKind::Add) {
    auto sum = cast<AffineBinaryOpExpr>(expr);
    // The left operand is checked first; the right one only if it passed.
    if (failed(checkOperandAffineExprRecursively(sum.getLHS(), seenIds)))
      return failure();
    return checkOperandAffineExprRecursively(sum.getRHS(), seenIds);
  }

  if (kind == AffineExprKind::Mul) {
    auto product = cast<AffineBinaryOpExpr>(expr);
    AffineExpr left = product.getLHS();
    AffineExpr right = product.getRHS();
    auto isDimTimesConst = [](AffineExpr maybeDim, AffineExpr maybeConst) {
      return maybeDim.getKind() == AffineExprKind::DimId &&
             maybeConst.getKind() == AffineExprKind::Constant;
    };
    if (isDimTimesConst(left, right))
      return markDim(left);
    if (isDimTimesConst(right, left))
      return markDim(right);
    return failure();
  }

  return failure();
}

/// Returns the set of loop dimension positions referenced by `expr`, or
/// failure when `expr` is not accepted by `checkOperandAffineExprRecursively`.
/// `numDims` is the number of dimensions that `expr` may refer to.
static FailureOr<llvm::SmallSet<unsigned, 2>>
checkOperandAffineExpr(AffineExpr expr, unsigned numDims) {
  SmallVector<bool> dimUsed(numDims, false);
  if (failed(checkOperandAffineExprRecursively(expr, dimUsed)))
    return failure();

  // Turn the per-dimension flags into a set of positions.
  llvm::SmallSet<unsigned, 2> usedDims;
  for (unsigned dim = 0; dim < numDims; ++dim)
    if (dimUsed[dim])
      usedDims.insert(dim);
  return usedDims;
}

//===----------------------------------------------------------------------===//
// mesh::getMeshShardingAttr
//===----------------------------------------------------------------------===//

/// Looks up the sharding annotation attached to `result` through its
/// `mesh.shard` users.
///
/// Returns a pair whose first element is the `annotate_for_users` flag of the
/// annotation and whose second element is the sharding attribute:
///   - if any `mesh.shard` user is not marked `annotate_for_users`, `result`
///     must have exactly one use; the pair is (false, that op's sharding),
///     otherwise failure is returned;
///   - else if there are `mesh.shard` users marked `annotate_for_users`, the
///     pair is (true, sharding of the first such user);
///   - else failure is returned.
FailureOr<std::pair<bool, MeshShardingAttr>>
mesh::getMeshShardingAttr(OpResult result) {
  Value val = result;

  // Single pass over the users: gather the shard ops and note which
  // annotation flavours are present.
  SmallVector<ShardOp> shardUsers;
  bool hasDefAnnotation = false;
  bool hasUserAnnotation = false;
  for (Operation *user : val.getUsers()) {
    auto shardOp = llvm::dyn_cast<ShardOp>(user);
    if (!shardOp)
      continue;
    shardUsers.push_back(shardOp);
    if (shardOp.getAnnotateForUsers())
      hasUserAnnotation = true;
    else
      hasDefAnnotation = true;
  }

  if (hasDefAnnotation) {
    // expected to have exact one use if it has a use of `mesh.shard` without
    // unit attr annotate_for_users
    if (!val.hasOneUse())
      return failure();
    // With a single use, the only user is the shard op found above.
    return std::make_pair(false, shardUsers.front().getShard());
  }

  if (!hasUserAnnotation)
    return failure();

  MeshShardingAttr shardForDef = shardUsers.front().getShard();
  // TODO: Deduce a reasonable mesh sharding attr for def when they are
  // different
  assert(llvm::all_of(shardUsers,
                      [&shardForDef](ShardOp shardOp) {
                        return shardOp.getShard() == shardForDef;
                      }) &&
         "only support all shard ops have the same mesh sharding attr");
  return std::make_pair(true, shardForDef);
}

/// Looks up the sharding annotation of `opOperand`: when the operand value is
/// produced by a `mesh.shard` op, returns that op's `annotate_for_users` flag
/// paired with its sharding attribute; otherwise returns failure.
FailureOr<std::pair<bool, MeshShardingAttr>>
mesh::getMeshShardingAttr(OpOperand &opOperand) {
  auto producer = opOperand.get().getDefiningOp<ShardOp>();
  if (!producer)
    return failure();
  return std::make_pair(producer.getAnnotateForUsers(), producer.getShard());
}

//===----------------------------------------------------------------------===//
// ShardingInterface::verifyShardingInterfaceImpl
//===----------------------------------------------------------------------===//

/// Checks that the operation satisfies the assumptions made by the default
/// sharding implementation in this file.
///
/// Succeeds only when
///   - every operand and result type is a `RankedTensorType`,
///   - there is at least one loop iterator type,
///   - there is exactly one indexing map per operand and per result (operand
///     maps first, then result maps), and
///   - every result indexing map is a projected permutation.
LogicalResult mesh::ShardingInterface::verifyShardingInterfaceImpl() {
  Operation *op = getOperation();

  // check operands and results type
  auto isRankedTensor = [](Type type) {
    return llvm::isa<RankedTensorType>(type);
  };
  if (!llvm::all_of(op->getOperandTypes(), isRankedTensor))
    return failure();
  if (!llvm::all_of(op->getResultTypes(), isRankedTensor))
    return failure();

  // check loop types
  if (getLoopIteratorTypes().empty())
    return failure();

  // check maps
  SmallVector<AffineMap> maps = getIndexingMaps();
  if (maps.empty())
    return failure();
  unsigned numOperands = op->getNumOperands();
  unsigned numResults = op->getNumResults();
  if (numOperands + numResults != maps.size())
    return failure();

  // Result types were already verified to be ranked tensors above, so only
  // the shape of each result map remains to be checked.
  for (unsigned resultIdx = 0; resultIdx < numResults; ++resultIdx) {
    if (!maps[numOperands + resultIdx].isProjectedPermutation())
      return failure();
  }

  return success();
}

//===----------------------------------------------------------------------===//
// ShardingInterface::printLoopTypesAndIndexingMaps
//===----------------------------------------------------------------------===//

/// Writes a debugging dump to `os`: the operation itself, its loop iterator
/// types on one line, and then one indexing map per line.
void mesh::ShardingInterface::printLoopTypesAndIndexingMaps(raw_ostream &os) {
  os << "print loop types and indexing maps for: \n";
  getOperation()->print(os);
  os << "\n"
     << "loop types: [";
  for (IteratorType iterType : getLoopIteratorTypes())
    os << stringifyEnum(iterType) << " ";
  os << "]\n"
     << "indexing maps: \n";
  for (AffineMap indexingMap : getIndexingMaps())
    os << indexingMap << "\n";
  os << "\n";
}

//===----------------------------------------------------------------------===//
// detail::defaultGetShardingOption
//===----------------------------------------------------------------------===//

namespace {

/// Records in `shardingOption` that loop `loopIdx` is sharded along `meshAxes`
/// on `cluster`.
///
/// Fails, leaving `shardingOption` unmodified, when
///   - `shardingOption.cluster` and `cluster` are both set and differ,
///   - loop `loopIdx` already has a non-empty axis list different from
///     `meshAxes`, or
///   - an axis of `meshAxes` is already assigned to a different loop.
///
/// On success, a non-null `cluster` is stored in `shardingOption.cluster`, and
/// `meshAxes` is stored for loop `loopIdx` if that loop had no axes yet.
///
/// `op` is currently unused. `loopIdx` is used to index
/// `shardingOption.shardingArray` without a range check, so it must be valid.
static LogicalResult fillShardingOption(Operation *op,
                                        ShardingOption &shardingOption,
                                        FlatSymbolRefAttr cluster,
                                        ArrayRef<MeshAxis> meshAxes,
                                        unsigned loopIdx) {
  if ((shardingOption.cluster && cluster &&
       shardingOption.cluster != cluster) ||
      (!shardingOption.shardingArray[loopIdx].empty() &&
       shardingOption.shardingArray[loopIdx] != meshAxes)) {
    LLVM_DEBUG(DBGS() << "sharding option conflicts on loop iterator "
                      << loopIdx << "\n");
    return failure();
  }

  // A mesh axis may shard at most one loop: reject any axis that is already
  // in use by a loop other than `loopIdx`.
  for (size_t i = 0; i < shardingOption.shardingArray.size(); ++i) {
    if (i == loopIdx)
      continue;

    for (MeshAxis axis : meshAxes) {
      if (llvm::is_contained(shardingOption.shardingArray[i], axis)) {
        LLVM_DEBUG(DBGS() << "sharding option conflicts because mesh axes "
                          << axis << " duplicate\n");
        return failure();
      }
    }
  }

  // All checks passed; commit the update.
  if (cluster)
    shardingOption.cluster = cluster;
  if (shardingOption.shardingArray[loopIdx].empty())
    shardingOption.shardingArray[loopIdx].append(meshAxes.begin(),
                                                 meshAxes.end());
  return success();
}

} // namespace

/// Derives a `ShardingOption` for `op` from the sharding annotations of its
/// operands and results.
///
/// `operandShardings` and `resultShardings` hold one (possibly null) sharding
/// attribute per operand / result; null entries are skipped. The option is
/// built in three steps:
///   1. each annotated result maps its split axes onto loops through its
///      indexing map; partial axes (allowed on at most one result) are
///      remembered for step 3;
///   2. each annotated operand maps its split axes onto loops when the
///      operand dimension depends on exactly one loop; a dimension depending
///      on several loops requires that one of them was already visited;
///   3. remembered partial axes are assigned to the first reduction loop that
///      matches the partial type, unless some reduction loop is already
///      sharded.
///
/// Returns failure (emitting an op error in most cases) when the interface
/// implementation is invalid, annotations conflict, an operand expression has
/// an unsupported form, or no matching reduction loop exists. If no operand
/// or result carries a sharding, the returned option has `empty` set.
FailureOr<ShardingOption> mesh::detail::defaultGetShardingOption(
    Operation *op, ArrayRef<MeshShardingAttr> operandShardings,
    ArrayRef<MeshShardingAttr> resultShardings) {
  ShardingInterface shardingOp = llvm::cast<ShardingInterface>(op);
  ShardingOption shardingOption;

  if (failed(shardingOp.verifyShardingInterfaceImpl()))
    return op->emitOpError() << "invalid sharding interface implementation";
  SmallVector<IteratorType> loopTypes = shardingOp.getLoopIteratorTypes();
  SmallVector<AffineMap> maps = shardingOp.getIndexingMaps();
  unsigned numOperands = op->getNumOperands();
  shardingOption.shardingArray.resize(loopTypes.size());
  llvm::SmallVector<MeshAxis> partialMeshAxes;
  // Only meaningful while `partialMeshAxes` is non-empty; initialized so that
  // it never holds an indeterminate value.
  Partial partialType = Partial::Sum;
  llvm::SmallSet<unsigned, 4> visitedLoopIndices;
  bool anyShardingInResultsOrOperands = false;

  // 1. Fill sharding option based on op results
  for (auto shardingIt : llvm::enumerate(resultShardings)) {
    MeshShardingAttr shardAttr = shardingIt.value();
    if (!shardAttr)
      continue;
    // Result maps are stored after the operand maps.
    AffineMap map = maps[numOperands + shardingIt.index()];
    anyShardingInResultsOrOperands = true;
    // Handle the split axes: calculate the corresponding loop index for each
    // split axes sub-array, and then store the sub-array to
    // shardingOption[index]
    for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) {
      AffineExpr expr = std::get<0>(it);
      ArrayRef<MeshAxis> axes = std::get<1>(it).asArrayRef();
      // Result maps were verified to be projected permutations, so every
      // result expression is a plain dimension.
      auto dim = cast<AffineDimExpr>(expr);
      unsigned index = dim.getPosition();
      visitedLoopIndices.insert(index);
      if (failed(fillShardingOption(op, shardingOption, shardAttr.getCluster(),
                                    axes, index)))
        return failure();
    }

    // Handle the partial axes: at this stage, the exact loop index/indices
    // cannot be decided because there could be multiple reduction loops.
    ArrayRef<MeshAxis> partialAxes = shardAttr.getPartialAxes();
    if (!partialAxes.empty()) {
      if (!partialMeshAxes.empty())
        return op->emitOpError() << "at most one result with partial axes is "
                                    "supported at present";
      partialType = shardAttr.getPartialType();
      partialMeshAxes.append(partialAxes.begin(), partialAxes.end());
      // Add all the reduction loop indices to `visitedLoopIndices` if
      // `partialAxes` is not empty
      for (size_t loopIdx = 0; loopIdx < loopTypes.size(); ++loopIdx) {
        if (isReductionLoop(loopTypes[loopIdx]))
          visitedLoopIndices.insert(loopIdx);
      }
    }
  }

  // 2. Fill sharding option based on operands
  for (auto shardingIt : llvm::enumerate(operandShardings)) {
    MeshShardingAttr shardAttr = shardingIt.value();
    if (!shardAttr)
      continue;

    anyShardingInResultsOrOperands = true;
    AffineMap map = maps[shardingIt.index()];
    unsigned numDims = map.getNumDims();

    // Handle the split axes. Partial axes don't need to be handled because they
    // only affect the defining op of the operand.
    //
    // TODO: Change to process the operands with single loop index first and
    // then the operands with multiple loop indices.
    for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) {
      AffineExpr expr = std::get<0>(it);
      ArrayRef<MeshAxis> axes = std::get<1>(it).asArrayRef();
      FailureOr<llvm::SmallSet<unsigned, 2>> loopIndices =
          checkOperandAffineExpr(expr, numDims);
      if (failed(loopIndices))
        return op->emitOpError()
               << "operand's affine expression is restricted to const_i * "
                  "dim_i + const_j + dim_j + ...";
      // The expression references no loop dimension: nothing to record.
      if (loopIndices->empty())
        continue;
      if (loopIndices->size() == 1) {
        unsigned loopIdx = *loopIndices->begin();
        visitedLoopIndices.insert(loopIdx);
        if (failed(fillShardingOption(op, shardingOption,
                                      shardAttr.getCluster(), axes, loopIdx)))
          return failure();
      }
      // If multiple loop indices correspond to a dimension of an operand, it is
      // difficult to infer which loop indices are responsible for sharding.
      // Therefore, the exact loop index must be specified by others.
      if (loopIndices->size() > 1) {
        bool seenLoopIndices = false;
        for (unsigned loopIdx : *loopIndices) {
          if (visitedLoopIndices.contains(loopIdx)) {
            seenLoopIndices = true;
            break;
          }
        }
        if (!seenLoopIndices)
          return op->emitOpError()
                 << "the operand " << shardingIt.index()
                 << " has multiple loop indices in a dimension, but none of "
                    "them could be found in the exactly specified annotation "
                    "of op results or operands.";
      }
    }
  }

  // 3. Finalize sharding option
  if (!partialMeshAxes.empty()) {
    // If a reduction loop is already sharded, the partial axes are not
    // assigned to any loop here.
    bool anyNonEmptyReductionLoop = llvm::any_of(
        llvm::enumerate(shardingOption.shardingArray), [&](auto it) {
          SmallVector<MeshAxis> &subArray = it.value();
          int64_t idx = it.index();
          return isReductionLoop(loopTypes[idx]) && !subArray.empty();
        });
    if (!anyNonEmptyReductionLoop) {
      bool filled = false;
      for (size_t idx = 0; idx < loopTypes.size(); ++idx) {
        if (isReductionLoop(loopTypes[idx]) &&
            areReductionAndPartialMatch(loopTypes[idx], partialType)) {
          // NOTE(review): the result of fillShardingOption is deliberately
          // discarded; on a conflict the partial axes stay unrecorded while
          // `filled` is still set. Confirm this best-effort behavior is
          // intended.
          std::ignore = fillShardingOption(op, shardingOption, nullptr,
                                           partialMeshAxes, idx);
          filled = true;
          break;
        }
      }
      if (!filled)
        return op->emitOpError() << "no matched reduction loop found for the "
                                    "result's partial type";
    }
  }
  removeTrailingEmptySubArray(shardingOption.shardingArray);
  if (!anyShardingInResultsOrOperands)
    shardingOption.empty = true;
  return shardingOption;
}

//===----------------------------------------------------------------------===//
// detail::defaultAddShardingAnnotations
//===----------------------------------------------------------------------===//

namespace {

/// Inserts a `mesh.shard` op (not marked `annotate_for_users`) right after
/// `result` and redirects all other uses of `result` to it.
///
/// Nothing is inserted when `result` already carries a sharding annotation
/// that is not marked `annotate_for_users`. Split axes are taken from
/// `shardingOption` through the result's indexing `map`; partial axes are the
/// axes of all reduction loops in `loopTypes`.
static LogicalResult addShardOp(OpBuilder &b, OpResult result,
                                const ShardingOption &shardingOption,
                                AffineMap map,
                                ArrayRef<IteratorType> loopTypes) {
  FailureOr<std::pair<bool, MeshShardingAttr>> existing =
      getMeshShardingAttr(result);
  const bool alreadyAnnotatedForDef = succeeded(existing) && !existing->first;
  if (alreadyAnnotatedForDef)
    return success();

  auto resultType = cast<RankedTensorType>(result.getType());
  const auto &loopSharding = shardingOption.shardingArray;

  // Split axes: each result dimension inherits the axes of the loop that
  // indexes it. Every map result is an `AffineDimExpr` because `map` is
  // verified by isProjectedPermutation.
  SmallVector<SmallVector<MeshAxis>> splitAxes(resultType.getRank());
  ArrayRef<AffineExpr> dimExprs = map.getResults();
  for (size_t dimIdx = 0, numExprs = dimExprs.size(); dimIdx < numExprs;
       ++dimIdx) {
    unsigned loopIdx = cast<AffineDimExpr>(dimExprs[dimIdx]).getPosition();
    if (loopIdx < loopSharding.size())
      splitAxes[dimIdx].append(loopSharding[loopIdx]);
  }

  // Partial axes: gather the axes of every reduction loop. The partial type
  // is ignored downstream when no partial axes are collected.
  SmallVector<MeshAxis> partialAxes;
  Partial partialType = Partial::Sum;
  for (auto [iterType, loopAxes] : llvm::zip(loopTypes, loopSharding)) {
    if (!isReductionLoop(iterType))
      continue;
    Partial reductionPartial = getPartialTypeFromReduction(iterType);
    assert((partialAxes.empty() || partialType == reductionPartial) &&
           "Only one reduction type is supported");
    partialType = reductionPartial;
    partialAxes.append(loopAxes);
  }

  removeTrailingEmptySubArray(splitAxes);
  MeshShardingAttr shardAttr =
      MeshShardingAttr::get(b.getContext(), shardingOption.cluster, splitAxes,
                            partialAxes, partialType);

  // Restore the builder's insertion point when leaving this function.
  OpBuilder::InsertionGuard guard(b);
  b.setInsertionPointAfterValue(result);
  auto shardOp = b.create<ShardOp>(result.getLoc(), resultType, result,
                                   shardAttr, /*annotate_for_users*/ false);
  // Keep the new shard op itself as the sole direct user of `result`.
  result.replaceAllUsesExcept(shardOp, shardOp);
  return success();
}

/// Inserts a `mesh.shard` op marked `annotate_for_users` in front of the owner
/// of `opOperand` and makes the operand use it.
///
/// Nothing is inserted when the operand is already produced by a `mesh.shard`
/// op marked `annotate_for_users`. Split axes are taken from `shardingOption`
/// through the operand's indexing `map`. Fails when an operand dimension has
/// an unsupported affine expression or depends on more than one sharded loop.
/// `loopTypes` is not consulted by this overload.
static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
                                const ShardingOption &shardingOption,
                                AffineMap map,
                                ArrayRef<IteratorType> loopTypes) {
  auto existing = getMeshShardingAttr(opOperand);
  const bool alreadyAnnotatedForUsers = succeeded(existing) && existing->first;
  if (alreadyAnnotatedForUsers)
    return success();

  Value operand = opOperand.get();
  auto operandType = cast<RankedTensorType>(operand.getType());
  const auto &loopSharding = shardingOption.shardingArray;
  const unsigned numDims = map.getNumDims();

  SmallVector<SmallVector<MeshAxis>> splitAxes(operandType.getRank());
  ArrayRef<AffineExpr> dimExprs = map.getResults();
  for (size_t dimIdx = 0, numExprs = dimExprs.size(); dimIdx < numExprs;
       ++dimIdx) {
    FailureOr<llvm::SmallSet<unsigned, 2>> loopIndices =
        checkOperandAffineExpr(dimExprs[dimIdx], numDims);
    if (failed(loopIndices))
      return failure();

    // At most one of the loops feeding this dimension may be sharded.
    const SmallVector<MeshAxis> *shardedAxes = nullptr;
    for (unsigned loopIdx : *loopIndices) {
      const bool isSharded = static_cast<size_t>(loopIdx) <
                                 loopSharding.size() &&
                             !loopSharding[loopIdx].empty();
      if (!isSharded)
        continue;
      if (shardedAxes)
        return failure();
      shardedAxes = &loopSharding[loopIdx];
    }
    if (shardedAxes)
      splitAxes[dimIdx].append(*shardedAxes);
  }

  removeTrailingEmptySubArray(splitAxes);
  MeshShardingAttr shardAttr =
      MeshShardingAttr::get(b.getContext(), shardingOption.cluster, splitAxes);

  // Restore the builder's insertion point when leaving this function.
  OpBuilder::InsertionGuard guard(b);
  b.setInsertionPoint(opOperand.getOwner());
  auto shardOp = b.create<ShardOp>(operand.getLoc(), operandType, operand,
                                   shardAttr, true);
  opOperand.set(shardOp);

  return success();
}

} // namespace

/// Adds `mesh.shard` annotations around `op` according to `shardingOption`:
/// first for every result, then for every operand. Indexing maps are looked
/// up with operand maps first and result maps after them. Stops and returns
/// failure as soon as one annotation cannot be added.
LogicalResult mesh::detail::defaultAddShardingAnnotations(
    Operation *op, OpBuilder &b, const ShardingOption &shardingOption) {
  auto shardingIface = llvm::cast<ShardingInterface>(op);
  SmallVector<IteratorType> loopTypes = shardingIface.getLoopIteratorTypes();
  SmallVector<AffineMap> maps = shardingIface.getIndexingMaps();
  const unsigned numOperands = op->getNumOperands();

  // 1. add mesh.shard ops for all op results
  for (OpResult result : op->getResults()) {
    AffineMap resultMap = maps[numOperands + result.getResultNumber()];
    LogicalResult added =
        addShardOp(b, result, shardingOption, resultMap, loopTypes);
    if (failed(added))
      return failure();
  }

  // 2. add mesh.shard ops for all operands
  for (OpOperand &opOperand : op->getOpOperands()) {
    AffineMap operandMap = maps[opOperand.getOperandNumber()];
    LogicalResult added =
        addShardOp(b, opOperand, shardingOption, operandMap, loopTypes);
    if (failed(added))
      return failure();
  }

  return success();
}
Embed LLVM 18.1.8 2025-02-14 19:21:04 +01:00			`//===- ShardingInterface.cpp -------------------------------------- C++--===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"`
			`#include "mlir/Dialect/Mesh/IR/MeshOps.h"`
			`#include "mlir/Dialect/Utils/IndexingUtils.h"`
			`#include "mlir/IR/AffineMap.h"`
			`#include "mlir/Support/LLVM.h"`
			`#include "llvm/ADT/SmallSet.h"`
			`#include "llvm/Support/Debug.h"`

			`#include <algorithm>`
			`#include <utility>`

			`#define DEBUG_TYPE "sharding-interface"`
			`#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")`

			`using namespace mlir;`
			`using namespace mlir::mesh;`

			`#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.cpp.inc"`

			`//===----------------------------------------------------------------------===//`
			`// common util functions`
			`//===----------------------------------------------------------------------===//`

			`static LogicalResult`
			`checkOperandAffineExprRecursively(AffineExpr expr,`
			`SmallVectorImpl<bool> &seenIds) {`
			`switch (expr.getKind()) {`
			`case AffineExprKind::Add: {`
			`auto binOpExpr = cast<AffineBinaryOpExpr>(expr);`
			`AffineExpr lhs = binOpExpr.getLHS();`
			`AffineExpr rhs = binOpExpr.getRHS();`
			`if (failed(checkOperandAffineExprRecursively(lhs, seenIds)))`
			`return failure();`
			`if (failed(checkOperandAffineExprRecursively(rhs, seenIds)))`
			`return failure();`
			`return success();`
			`}`
			`case AffineExprKind::Mul: {`
			`auto binOpExpr = cast<AffineBinaryOpExpr>(expr);`
			`AffineExpr lhs = binOpExpr.getLHS();`
			`AffineExpr rhs = binOpExpr.getRHS();`
			`AffineExpr dimExpr;`
			`if (lhs.getKind() == AffineExprKind::DimId &&`
			`rhs.getKind() == AffineExprKind::Constant) {`
			`dimExpr = lhs;`
			`} else if (rhs.getKind() == AffineExprKind::DimId &&`
			`lhs.getKind() == AffineExprKind::Constant) {`
			`dimExpr = rhs;`
			`} else`
			`return failure();`
			`unsigned position = cast<AffineDimExpr>(dimExpr).getPosition();`
			`if ((size_t)position >= seenIds.size() \|\| seenIds[position])`
			`return failure();`
			`seenIds[position] = true;`
			`return success();`
			`}`
			`case AffineExprKind::DimId: {`
			`unsigned position = cast<AffineDimExpr>(expr).getPosition();`
			`if ((size_t)position >= seenIds.size() \|\| seenIds[position])`
			`return failure();`
			`seenIds[position] = true;`
			`return success();`
			`}`
			`default:`
			`return failure();`
			`}`
			`}`

			`static FailureOr<llvm::SmallSet<unsigned, 2>>`
			`checkOperandAffineExpr(AffineExpr expr, unsigned numDims) {`
			`SmallVector<bool> seenIds(numDims, false);`
			`if (failed(checkOperandAffineExprRecursively(expr, seenIds)))`
			`return failure();`

			`llvm::SmallSet<unsigned, 2> positions;`
			`for (auto it : llvm::enumerate(seenIds)) {`
			`if (it.value())`
			`positions.insert((unsigned)it.index());`
			`}`
			`return positions;`
			`}`

			`//===----------------------------------------------------------------------===//`
			`// mesh::getMeshShardingAttr`
			`//===----------------------------------------------------------------------===//`

			`FailureOr<std::pair<bool, MeshShardingAttr>>`
			`mesh::getMeshShardingAttr(OpResult result) {`
			`Value val = result.cast<Value>();`
			`bool anyShardedForDef = llvm::any_of(val.getUsers(), [](Operation *user) {`
			`auto shardOp = llvm::dyn_cast<mesh::ShardOp>(user);`
			`if (!shardOp)`
			`return false;`
			`return !shardOp.getAnnotateForUsers();`
			`});`

			`if (anyShardedForDef) {`
			// expected to have exact one use if it has a use of `mesh.shard` without
			`// unit attr annotate_for_users`
			`if (!val.hasOneUse())`
			`return failure();`
			`auto shardOp = llvm::cast<mesh::ShardOp>(*val.getUsers().begin());`
			`return std::make_pair(false, shardOp.getShard());`
			`}`

			`bool anyShardedForUsers = llvm::any_of(val.getUsers(), [](Operation *user) {`
			`auto shardOp = llvm::dyn_cast<mesh::ShardOp>(user);`
			`if (!shardOp)`
			`return false;`
			`return shardOp.getAnnotateForUsers();`
			`});`
			`if (anyShardedForUsers) {`
			`SmallVector<ShardOp> shardOps;`
			`for (Operation *user : val.getUsers()) {`
			`ShardOp shardOp = llvm::dyn_cast<ShardOp>(user);`
			`if (shardOp)`
			`shardOps.push_back(shardOp);`
			`}`
			`MeshShardingAttr shardForDef = shardOps[0].getShard();`
			`for (size_t i = 1; i < shardOps.size(); ++i) {`
			`// TODO: Deduce a reasonable mesh sharding attr for def when they are`
			`// different`
			`assert(shardOps[i].getShard() == shardForDef &&`
			`"only support all shard ops have the same mesh sharding attr");`
			`}`
			`return std::make_pair(true, shardForDef);`
			`}`
			`return failure();`
			`}`

			`FailureOr<std::pair<bool, MeshShardingAttr>>`
			`mesh::getMeshShardingAttr(OpOperand &opOperand) {`
			`Value val = opOperand.get();`
			`if (ShardOp shardOp = val.getDefiningOp<ShardOp>())`
			`return std::make_pair(shardOp.getAnnotateForUsers(), shardOp.getShard());`

			`return failure();`
			`}`

			`//===----------------------------------------------------------------------===//`
			`// ShardingInterface::verifyShardingInterfaceImpl`
			`//===----------------------------------------------------------------------===//`

			`LogicalResult mesh::ShardingInterface::verifyShardingInterfaceImpl() {`
			`Operation *op = getOperation();`

			`// check operands and results type`
			`for (Type type : op->getOperandTypes())`
			`if (!llvm::isa<RankedTensorType>(type))`
			`return failure();`
			`for (Type type : op->getResultTypes())`
			`if (!llvm::isa<RankedTensorType>(type))`
			`return failure();`

			`// check loop types`
			`SmallVector<IteratorType> loopTypes = getLoopIteratorTypes();`
			`if (loopTypes.size() == 0)`
			`return failure();`

			`// check maps`
			`SmallVector<AffineMap> maps = getIndexingMaps();`
			`if (maps.size() == 0)`
			`return failure();`
			`unsigned numOperands = op->getNumOperands();`
			`unsigned numResults = op->getNumResults();`
			`if (numOperands + numResults != maps.size())`
			`return failure();`

			`for (OpResult result : op->getResults()) {`
			`auto resultType = result.getType().dyn_cast<RankedTensorType>();`
			`if (!resultType)`
			`return failure();`
			`AffineMap map = maps[numOperands + result.getResultNumber()];`
			`if (!map.isProjectedPermutation()) {`
			`return failure();`
			`}`
			`}`

			`return success();`
			`}`

			`//===----------------------------------------------------------------------===//`
			`// ShardingInterface::printLoopTypesAndIndexingMaps`
			`//===----------------------------------------------------------------------===//`

			`void mesh::ShardingInterface::printLoopTypesAndIndexingMaps(raw_ostream &os) {`
			`os << "print loop types and indexing maps for: \n";`
			`getOperation()->print(os);`
			`os << "\n";`
			`os << "loop types: [";`
			`for (IteratorType type : getLoopIteratorTypes()) {`
			`os << stringifyEnum(type) << " ";`
			`}`
			`os << "]\n";`
			`os << "indexing maps: \n";`
			`for (AffineMap map : getIndexingMaps())`
			`os << map << "\n";`
			`os << "\n";`
			`}`

			`//===----------------------------------------------------------------------===//`
			`// detail::defaultGetShardingOption`
			`//===----------------------------------------------------------------------===//`

			`namespace {`

			// Update the given `shardingOption` according to `meshAxes` and `loopIdx`
			`static LogicalResult fillShardingOption(Operation *op,`
			`ShardingOption &shardingOption,`
			`FlatSymbolRefAttr cluster,`
			`ArrayRef<MeshAxis> meshAxes,`
			`unsigned loopIdx) {`
			`if ((shardingOption.cluster && cluster &&`
			`shardingOption.cluster != cluster) \|\|`
			`(!shardingOption.shardingArray[loopIdx].empty() &&`
			`shardingOption.shardingArray[loopIdx] != meshAxes)) {`
			`LLVM_DEBUG(DBGS() << "sharding option conflicts on loop iterator "`
			`<< loopIdx << "\n");`
			`return failure();`
			`}`
			`for (size_t i = 0; i < shardingOption.shardingArray.size(); ++i) {`
			`if (i == loopIdx)`
			`continue;`

			`for (MeshAxis axis : meshAxes) {`
			`if (llvm::is_contained(shardingOption.shardingArray[i], axis)) {`
			`LLVM_DEBUG(DBGS() << "sharding option conflicts because mesh axes "`
			`<< axis << " duplicate");`
			`return failure();`
			`}`
			`}`
			`}`
			`if (cluster)`
			`shardingOption.cluster = cluster;`
			`if (shardingOption.shardingArray[loopIdx].empty())`
			`shardingOption.shardingArray[loopIdx].append(meshAxes.begin(),`
			`meshAxes.end());`
			`return success();`
			`}`

			`} // namespace`

			`FailureOr<ShardingOption> mesh::detail::defaultGetShardingOption(`
			`Operation *op, ArrayRef<MeshShardingAttr> operandShardings,`
			`ArrayRef<MeshShardingAttr> resultShardings) {`
			`ShardingInterface shardingOp = llvm::cast<ShardingInterface>(op);`
			`ShardingOption shardingOption;`

			`if (failed(shardingOp.verifyShardingInterfaceImpl()))`
			`return op->emitOpError() << "invalid sharding interface implementation";`
			`SmallVector<IteratorType> loopTypes = shardingOp.getLoopIteratorTypes();`
			`SmallVector<AffineMap> maps = shardingOp.getIndexingMaps();`
			`unsigned numOperands = op->getNumOperands();`
			`shardingOption.shardingArray.resize(loopTypes.size());`
			`llvm::SmallVector<MeshAxis> partialMeshAxes;`
			`Partial partialType;`
			`llvm::SmallSet<unsigned, 4> visitedLoopIndices;`
			`bool anyShardingInResultsOrOperands = false;`

			`// 1. Fill sharding option based on op results`
			`for (auto shardingIt : llvm::enumerate(resultShardings)) {`
			`MeshShardingAttr shardAttr = shardingIt.value();`
			`if (!shardAttr)`
			`continue;`
			`AffineMap map = maps[numOperands + shardingIt.index()];`
			`anyShardingInResultsOrOperands = true;`
			`// Handle the split axes: calculate the corresponding loop index for each`
			`// split axes sub-array, and then store the sub-array to`
			`// shardingOption[index]`
			`for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) {`
			`AffineExpr expr = std::get<0>(it);`
			`ArrayRef<MeshAxis> axes = std::get<1>(it).asArrayRef();`
			`auto dim = cast<AffineDimExpr>(expr);`
			`unsigned index = dim.getPosition();`
			`visitedLoopIndices.insert(index);`
			`if (failed(fillShardingOption(op, shardingOption, shardAttr.getCluster(),`
			`axes, index)))`
			`return failure();`
			`}`

			`// Handle the partial axes: at this stage, the exact loop index/indices`
			`// cannot be decided because there could be multiple reduction loops.`
			`ArrayRef<MeshAxis> partialAxes = shardAttr.getPartialAxes();`
			`if (!partialAxes.empty()) {`
			`if (!partialMeshAxes.empty())`
			`return op->emitOpError() << "at most one result with partial axes is "`
			`"supported at present";`
			`partialType = shardAttr.getPartialType();`
			`partialMeshAxes.append(partialAxes.begin(), partialAxes.end());`
			// Add all the reduction loop indices to `visitedLoopIndices` if
			// `partialAxes` is not empty
			`for (size_t loopIdx = 0; loopIdx < loopTypes.size(); ++loopIdx) {`
			`if (isReductionLoop(loopTypes[loopIdx]))`
			`visitedLoopIndices.insert(loopIdx);`
			`}`
			`}`
			`}`

			`// 2. Fill sharding option based on operands`
			`for (auto shardingIt : llvm::enumerate(operandShardings)) {`
			`MeshShardingAttr shardAttr = shardingIt.value();`
			`if (!shardAttr)`
			`continue;`

			`anyShardingInResultsOrOperands = true;`
			`AffineMap map = maps[shardingIt.index()];`
			`unsigned numDims = map.getNumDims();`

			`// Handle the split axes. Partial axes don't need to be handled because they`
			`// only affect the defining op of the operand.`
			`//`
			`// TODO: Change to process the operands with single loop index first and`
			`// then the operands with multiple loop indices.`
			`for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) {`
			`AffineExpr expr = std::get<0>(it);`
			`ArrayRef<MeshAxis> axes = std::get<1>(it).asArrayRef();`
			`FailureOr<llvm::SmallSet<unsigned, 2>> loopIndices =`
			`checkOperandAffineExpr(expr, numDims);`
			`if (failed(loopIndices))`
			`return op->emitOpError()`
			`<< "operand's affine expression is restricted to const_i * "`
			`"dim_i + const_j + dim_j + ...";`
			`if (loopIndices->empty())`
			`continue;`
			`if (loopIndices->size() == 1) {`
			`unsigned loopIdx = *loopIndices->begin();`
			`visitedLoopIndices.insert(loopIdx);`
			`if (failed(fillShardingOption(op, shardingOption,`
			`shardAttr.getCluster(), axes, loopIdx)))`
			`return failure();`
			`}`
			`// If multiple loop indices correspond to a dimension of an operand, it is`
			`// difficult to infer which loop indices are responsible for sharding.`
			`// Therefore, the exact loop index must be specified by others.`
			`if (loopIndices->size() > 1) {`
			`bool seenLoopIndices = false;`
			`for (unsigned loopIdx : *loopIndices) {`
			`if (visitedLoopIndices.contains(loopIdx)) {`
			`seenLoopIndices = true;`
			`break;`
			`}`
			`}`
			`if (!seenLoopIndices)`
			`return op->emitOpError()`
			`<< "the operand " << shardingIt.index()`
			`<< " has multiple loop indices in a dimension, but none of "`
			`"them could be found in the exactly specified annotation "`
			`"of op results or operands.";`
			`}`
			`}`
			`}`

			`// 3. Finalize sharding option`
			`if (!partialMeshAxes.empty()) {`
			`bool anyNonEmptyReductionLoop = llvm::any_of(`
			`llvm::enumerate(shardingOption.shardingArray), [&](auto it) {`
			`SmallVector<MeshAxis> &subArray = it.value();`
			`int64_t idx = it.index();`
			`return isReductionLoop(loopTypes[idx]) && !subArray.empty();`
			`});`
			`if (!anyNonEmptyReductionLoop) {`
			`bool filled = false;`
			`for (size_t idx = 0; idx < loopTypes.size(); ++idx) {`
			`if (isReductionLoop(loopTypes[idx]) &&`
			`areReductionAndPartialMatch(loopTypes[idx], partialType)) {`
			`std::ignore = fillShardingOption(op, shardingOption, nullptr,`
			`partialMeshAxes, idx);`
			`filled = true;`
			`break;`
			`}`
			`}`
			`if (!filled)`
			`return op->emitOpError() << "no matched reduction loop found for the "`
			`"result's partial type";`
			`}`
			`}`
			`removeTrailingEmptySubArray(shardingOption.shardingArray);`
			`if (!anyShardingInResultsOrOperands)`
			`shardingOption.empty = true;`
			`return shardingOption;`
			`}`

			`//===----------------------------------------------------------------------===//`
			`// detail::defaultAddShardingAnnotations`
			`//===----------------------------------------------------------------------===//`

			`namespace {`

			// To add a `mesh.shard` op for the given result, based on the details provided
			// in `shardingOption`, `map`, and `loopTypes`.
			`static LogicalResult addShardOp(OpBuilder &b, OpResult result,`
			`const ShardingOption &shardingOption,`
			`AffineMap map,`
			`ArrayRef<IteratorType> loopTypes) {`
			`FailureOr<std::pair<bool, MeshShardingAttr>> maybeSharding =`
			`getMeshShardingAttr(result);`
			`if (succeeded(maybeSharding) && !maybeSharding->first)`
			`return success();`

			`auto resultType = result.getType().cast<RankedTensorType>();`
			`SmallVector<SmallVector<MeshAxis>> splitAxes(resultType.getRank());`
			`SmallVector<MeshAxis> partialAxes;`

			`// process the split axes`
			`for (auto it : llvm::enumerate(map.getResults())) {`
			`AffineExpr expr = it.value();`
			// `expr` must be an `AffineDimExpr` because `map` is verified by
			`// isProjectedPermutation`
			`auto dim = cast<AffineDimExpr>(expr);`
			`unsigned loopIdx = dim.getPosition();`
			`if (loopIdx < shardingOption.shardingArray.size())`
			`splitAxes[it.index()].append(shardingOption.shardingArray[loopIdx]);`
			`}`

			`// process the partial axes`
			`// partialType will be ignored if partialAxes is empty`
			`Partial partialType = Partial::Sum;`
			`for (auto it : llvm::zip(loopTypes, shardingOption.shardingArray)) {`
			`IteratorType iType = std::get<0>(it);`
			`if (isReductionLoop(iType)) {`
			`Partial curPartialType = getPartialTypeFromReduction(iType);`
			`if (!partialAxes.empty())`
			`assert(partialType == curPartialType &&`
			`"Only one reduction type is supported");`
			`partialType = curPartialType;`
			`const SmallVector<MeshAxis> &axis = std::get<1>(it);`
			`partialAxes.append(axis);`
			`}`
			`}`

			`removeTrailingEmptySubArray(splitAxes);`
			`MeshShardingAttr shardAttr =`
			`MeshShardingAttr::get(b.getContext(), shardingOption.cluster, splitAxes,`
			`partialAxes, partialType);`
			`OpBuilder::InsertionGuard guard(b);`
			`b.setInsertionPointAfterValue(result);`
			`auto shardOp = b.create<ShardOp>(result.getLoc(), resultType, result,`
			`shardAttr, /annotate_for_users/ false);`
			`result.replaceAllUsesExcept(shardOp, shardOp);`
			`return success();`
			`}`

			// To add a `mesh.shard` op for the given operand, based on the details provided
			// in `shardingOption`, `map`, and `loopTypes`.
			`static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,`
			`const ShardingOption &shardingOption,`
			`AffineMap map,`
			`ArrayRef<IteratorType> loopTypes) {`
			`auto maybeShardingAttr = getMeshShardingAttr(opOperand);`
			`if (succeeded(maybeShardingAttr) && maybeShardingAttr->first)`
			`return success();`
			`Value operand = opOperand.get();`
			`auto operandType = operand.getType().cast<RankedTensorType>();`
			`SmallVector<SmallVector<MeshAxis>> splitAxes(operandType.getRank());`
			`unsigned numDims = map.getNumDims();`
			`for (auto it : llvm::enumerate(map.getResults())) {`
			`int64_t idx = it.index();`
			`AffineExpr expr = it.value();`
			`FailureOr<llvm::SmallSet<unsigned, 2>> loopIndices =`
			`checkOperandAffineExpr(expr, numDims);`
			`if (failed(loopIndices))`
			`return failure();`
			`SmallVector<unsigned> shardedLoopIndices;`
			`for (unsigned loopIdx : *loopIndices) {`
			`if ((size_t)loopIdx < shardingOption.shardingArray.size() &&`
			`!shardingOption.shardingArray[loopIdx].empty())`
			`shardedLoopIndices.push_back(loopIdx);`
			`}`
			`// mostly one sharded loop index is accepted`
			`if (shardedLoopIndices.size() > 1)`
			`return failure();`
			`if (shardedLoopIndices.size() == 1) {`
			`splitAxes[idx].append(`
			`shardingOption.shardingArray[shardedLoopIndices[0]]);`
			`}`
			`}`

			`removeTrailingEmptySubArray(splitAxes);`
			`MeshShardingAttr shardAttr =`
			`MeshShardingAttr::get(b.getContext(), shardingOption.cluster, splitAxes);`
			`OpBuilder::InsertionGuard guard(b);`
			`b.setInsertionPoint(opOperand.getOwner());`
			`auto shardOp = b.create<ShardOp>(operand.getLoc(), operandType, operand,`
			`shardAttr, true);`
			`opOperand.set(shardOp);`

			`return success();`
			`}`

			`} // namespace`

			// Default implementation of the "add sharding annotations" hook: wraps every
			// result and every operand of `op` with a `mesh.shard` op derived from
			// `shardingOption`. `op` must implement `ShardingInterface`. The indexing maps
			// are indexed with the operands first, followed by the results. Stops and
			// returns failure() as soon as one annotation cannot be added.
			LogicalResult mesh::detail::defaultAddShardingAnnotations(
			    Operation *op, OpBuilder &b, const ShardingOption &shardingOption) {
			  auto iface = llvm::cast<ShardingInterface>(op);
			  SmallVector<IteratorType> iteratorTypes = iface.getLoopIteratorTypes();
			  SmallVector<AffineMap> indexingMaps = iface.getIndexingMaps();
			  unsigned operandCount = op->getNumOperands();

			  // Step 1: annotate the results. Their maps come after the operand maps.
			  for (OpResult res : op->getResults()) {
			    AffineMap resultMap = indexingMaps[operandCount + res.getResultNumber()];
			    LogicalResult status =
			        addShardOp(b, res, shardingOption, resultMap, iteratorTypes);
			    if (failed(status))
			      return failure();
			  }

			  // Step 2: annotate the operands.
			  for (OpOperand &use : op->getOpOperands()) {
			    AffineMap operandMap = indexingMaps[use.getOperandNumber()];
			    LogicalResult status =
			        addShardOp(b, use, shardingOption, operandMap, iteratorTypes);
			    if (failed(status))
			      return failure();
			  }

			  return success();
			}